Skip to content
22 changes: 22 additions & 0 deletions licensedb/internal/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,3 +460,25 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
}
return weight
}

func (db *database) QuerySourceFile(text string) map[string]float32 {
candidates := map[string]float32{}
append := func(others map[string]float32) {
for key, val := range others {
if candidates[key] < val {
candidates[key] = val
}
}
}
append(db.QueryLicenseText(string(text)))
if len(candidates) == 0 {
// TO DO: split license-comments from description-comments.
Copy link
Author

@merybenavente merybenavente Jul 30, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As all of the files in the project are scanned for licenses, I thought about doing this split here for performance's sake. What do you think? Should it be done earlier to improve the percentages in the output? If the license is clear, as in this project, it is found without problems.

}
if db.debug {
for key, val := range candidates {
println("NLP", key, val)
}
}
db.addURLMatches(candidates, text)
return candidates
}
106 changes: 106 additions & 0 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
"gopkg.in/src-d/enry.v1"
)

var (
Expand Down Expand Up @@ -62,6 +63,34 @@ var (

licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
"^(%s)$", strings.Join(licenseFileNames, "|")))

commentSyntaxes = map[string]*regexp.Regexp {
"ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
"Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
"Haskel": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
"Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
"PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
"Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
"Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
"R": regexp.MustCompile(`#.*\t?\r?\n?`),
"Shell": regexp.MustCompile(`#.*\t?\r?\n?`),
"Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
"YAML": regexp.MustCompile(`#.*\t?\r?\n?`),
}
)

// ExtractLicenseFiles returns the list of possible license texts.
Expand Down Expand Up @@ -157,3 +186,80 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
func IsLicenseDirectory(fileName string) bool {
	// The comparison is case-insensitive: the name is lowercased before it is
	// tested against licenseDirectoryRe.
	lowered := strings.ToLower(fileName)
	return licenseDirectoryRe.MatchString(lowered)
}

// ExtractSourceFiles reads the given files and returns the comments found at the head of
// each of them, as produced by ExtractHeaderComments.
//
// Files that cannot be read are skipped. The language of every readable file is determined
// by enry from the file name alone (no content is passed), and files whose extension has a
// registered entry in filePreprocessors are run through that preprocessor first.
func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
	var (
		contents  = [][]byte{}
		languages = []string{}
		noContent []byte
	)
	for _, name := range files {
		data, err := fs.ReadFile(name)
		if err != nil {
			// Best effort: unreadable files are simply not investigated.
			continue
		}
		languages = append(languages, enry.GetLanguage(name, noContent))
		if preprocess, ok := filePreprocessors[paths.Ext(name)]; ok {
			data = preprocess(data)
		}
		contents = append(contents, data)
	}
	if len(contents) == 0 {
		return contents
	}
	return ExtractHeaderComments(contents, languages)
}

// ExtractHeaderComments searches the head of each source file for comments and returns the
// concatenated comment text of every file in which at least one comment was found.
//
// candidates[i] is the content of a file and langs[i] is the name of its language, which
// selects the comment syntax from commentSyntaxes. Only the first 1024 bytes of each file
// are scanned. Files without a comment in that window are omitted, so the result may be
// shorter than candidates. Languages without an entry in commentSyntaxes are not scanned;
// each of them is reported once, in order of first appearance, on standard output.
func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
	const headerSize = 1024
	comments := [][]byte{}
	var unsupported []string
	reported := map[string]struct{}{}
	for i, candidate := range candidates {
		// Tolerate a langs slice that is shorter than candidates instead of panicking;
		// such files are handled like files with an empty language name.
		var lang string
		if i < len(langs) {
			lang = langs[i]
		}
		re, known := commentSyntaxes[lang]
		if !known {
			// De-duplicate by exact name. Using the name as a regular expression (as
			// the previous implementation did) breaks on names such as "Objective-C++"
			// and hides names that are substrings of already reported ones.
			// An empty language name is never reported.
			if _, seen := reported[lang]; lang != "" && !seen {
				reported[lang] = struct{}{}
				unsupported = append(unsupported, lang)
			}
			continue
		}
		header := candidate
		if len(header) > headerSize {
			header = header[:headerSize]
		}
		if matches := re.FindAllString(string(header), -1); matches != nil {
			comments = append(comments, []byte(strings.Join(matches, "")))
		}
	}
	if len(unsupported) > 0 {
		fmt.Println("The following file types were not investigated for licenses on the comments:",
			strings.Join(unsupported, ", ")+". ")
	}
	return comments
}

// InvestigateHeaderComments runs InvestigateHeaderComment on every given header comment and
// merges the results: for each license name, the highest similarity seen across all texts
// is kept. The fs argument is not used.
func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
	best := map[string]float32{}
	for i := range texts {
		for license, score := range InvestigateHeaderComment(texts[i]) {
			if best[license] < score {
				best[license] = score
			}
		}
	}
	return best
}

// InvestigateHeaderComment queries the global license database with the text of a single
// header comment and returns the result of its QuerySourceFile method, a map from license
// name to score.
func InvestigateHeaderComment(text []byte) map[string]float32 {
	db := globalLicenseDatabase()
	comment := string(text)
	return db.QuerySourceFile(comment)
}
30 changes: 27 additions & 3 deletions licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,36 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
}
// Plan B: take the README, find the section about the license and apply NER
candidates = internal.ExtractReadmeFiles(fileNames, fs)
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
if len(candidates) > 0 {
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) > 0 {
return licenses, nil
}
}
// Plan C: look for licence texts in source code files with comments at header
var extendedFileNames []string
extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
candidates = internal.ExtractSourceFiles(extendedFileNames, fs)
if len(candidates) > 0 {
licenses = internal.InvestigateHeaderComments(candidates, fs)
}
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
}
return licenses, nil
}

// extractAllSubfiles walks the directory tree rooted at path and appends the path of every
// non-directory entry to fileNames, returning the extended slice. Directories that cannot
// be listed are skipped without reporting an error.
func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
	entries, err := fs.ReadDir(path)
	if err != nil {
		return fileNames
	}
	for _, entry := range entries {
		full := paths.Join(path, entry.Name)
		if !entry.IsDir {
			fileNames = append(fileNames, full)
			continue
		}
		// Recurse into the subdirectory, accumulating into the same slice.
		fileNames = extractAllSubfiles(fs, fileNames, full)
	}
	return fileNames
}