Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Rooted repositories have a few particularities that you should know to work with

gitcollector is invoked through the `download` subcommand (currently the only subcommand available):

```
```txt
Usage:
gitcollector [OPTIONS] download [download-OPTIONS]

Expand All @@ -43,7 +43,9 @@ Help Options:
--bucket= library bucketization level (default: 2) [$GITCOLLECTOR_LIBRARY_BUCKET]
--tmp= directory to place generated temporal files (default: /tmp) [$GITCOLLECTOR_TMP]
--workers= number of workers, default to GOMAXPROCS [$GITCOLLECTOR_WORKERS]
--half-cpu set the number of workers to half of the set workers [$GITCOLLECTOR_HALF_CPU]
--no-updates don't allow updates on already downloaded repositories [$GITCOLLECTOR_NO_UPDATES]
--no-forks github forked repositories will not be downloaded [$GITCOLLECTOR_NO_FORKS]
--orgs= list of github organization names separated by comma [$GITHUB_ORGANIZATIONS]
--token= github token [$GITHUB_TOKEN]
--metrics-db= uri to a database where metrics will be sent [$GITCOLLECTOR_METRICS_DB_URI]
Expand Down
8 changes: 6 additions & 2 deletions cmd/gitcollector/subcmd/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type DownloadCmd struct {
Workers int `long:"workers" description:"number of workers, default to GOMAXPROCS" env:"GITCOLLECTOR_WORKERS"`
HalfCPU bool `long:"half-cpu" description:"set the number of workers to half of the set workers" env:"GITCOLLECTOR_HALF_CPU"`
NotAllowUpdates bool `long:"no-updates" description:"don't allow updates on already downloaded repositories" env:"GITCOLLECTOR_NO_UPDATES"`
NoForks bool `long:"no-forks" description:"github forked repositories will not be downloaded" env:"GITCOLLECTOR_NO_FORKS"`
Orgs string `long:"orgs" env:"GITHUB_ORGANIZATIONS" description:"list of github organization names separated by comma" required:"true"`
Token string `long:"token" env:"GITHUB_TOKEN" description:"github token"`
MetricsDBURI string `long:"metrics-db" env:"GITCOLLECTOR_METRICS_DB_URI" description:"uri to a database where metrics will be sent"`
Expand Down Expand Up @@ -139,7 +140,7 @@ func (c *DownloadCmd) Execute(args []string) error {
wp.Run()
log.Debugf("worker pool is running")

go runGHOrgProviders(log.New(nil), orgs, c.Token, download)
go runGHOrgProviders(log.New(nil), orgs, c.Token, download, c.NoForks)

wp.Wait()
log.Debugf("worker pool stopped successfully")
Expand Down Expand Up @@ -183,6 +184,7 @@ func runGHOrgProviders(
orgs []string,
token string,
download chan gitcollector.Job,
skipForks bool,
) {
var wg sync.WaitGroup
wg.Add(len(orgs))
Expand All @@ -196,7 +198,9 @@ func runGHOrgProviders(
AuthToken: token,
},
),
&discovery.GHProviderOpts{},
&discovery.GHProviderOpts{
SkipForks: skipForks,
},
)

go func() {
Expand Down
5 changes: 5 additions & 0 deletions discovery/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ var (

// GHProviderOpts represents configuration options for a GHProvider.
type GHProviderOpts struct {
SkipForks bool
WaitNewRepos bool
WaitOnRateLimit bool
StopTimeout time.Duration
Expand Down Expand Up @@ -158,6 +159,10 @@ func (p *GHProvider) enqueueJob(ctx context.Context) error {
return nil
}

if p.opts.SkipForks && repo.GetFork() {
return nil
}

endpoint, err := getEndpoint(repo)
if err != nil {
return nil
Expand Down
60 changes: 59 additions & 1 deletion discovery/provider_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package discovery

import (
"fmt"
"os"
"strings"
"testing"
"time"
Expand All @@ -19,13 +21,14 @@ func TestGHProvider(t *testing.T) {
timeToStop = 5 * time.Second
)

token, _ := testToken()
queue := make(chan gitcollector.Job, 50)
provider := NewGHProvider(
queue,
NewGHOrgReposIter(org, &GHReposIterOpts{
TimeNewRepos: 1 * time.Second,
ResultsPerPage: 100,
AuthToken: "",
AuthToken: token,
}),
&GHProviderOpts{
MaxJobBuffer: 50,
Expand Down Expand Up @@ -72,3 +75,58 @@ func TestGHProvider(t *testing.T) {
req.True(strings.Contains(job.Endpoints[0], org))
}
}

// TestGHProviderSkipForks checks that a GHProvider created with
// SkipForks enabled does not enqueue jobs whose endpoint contains the name of
// any repository listed in forkedRepos.
//
// The test is skipped when testToken reports an error.
func TestGHProviderSkipForks(t *testing.T) {
	var req = require.New(t)
	const org = "src-d"

	token, skip := testToken()
	if skip != nil {
		t.Skip(skip.Error())
	}

	queue := make(chan gitcollector.Job, 200)
	provider := NewGHProvider(
		queue,
		NewGHOrgReposIter(org, &GHReposIterOpts{
			AuthToken: token,
		}),
		&GHProviderOpts{
			SkipForks:    true,
			MaxJobBuffer: 50,
		},
	)

	// The queue is only inspected after Start has returned, so Start is
	// called synchronously.
	err := provider.Start()

	// Fail fast on a nil error: calling err.Error() below on a nil error
	// would panic instead of reporting a test failure.
	req.Error(err)
	req.True(ErrNewRepositoriesNotFound.Is(err), err.Error())

	// Close the queue so that the range loop below terminates once the
	// buffered jobs have been drained.
	close(queue)

	forkedRepos := []string{"or-tools", "PyHive", "go-oniguruma"}
	for job := range queue {
		j, ok := job.(*library.Job)
		req.True(ok)
		req.Len(j.Endpoints, 1)

		for _, forked := range forkedRepos {
			req.False(strings.Contains(j.Endpoints[0], forked))
		}
	}
}

// testToken returns the value of the GITHUB_TOKEN environment variable.
//
// The returned error is non-nil only when the token is empty and the TRAVIS
// environment variable equals "true"; in every other case the error is nil.
// The token value is returned as read in both cases.
func testToken() (string, error) {
	token := os.Getenv("GITHUB_TOKEN")

	// A token is available, or the test is not running on travis: nothing
	// to report.
	if token != "" || os.Getenv("TRAVIS") != "true" {
		return token, nil
	}

	return token, fmt.Errorf("test running on travis CI but " +
		"couldn't find GITHUB_TOKEN")
}
1 change: 0 additions & 1 deletion library/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ func NewJobScheduleFn(
}

if errClosedChan.Is(err) {
println("CLOSE")
download = nil
}
}
Expand Down