src-d · merybenavente · Jul 17, 2018 · Jul 18, 2018 · Jul 20, 2018 · Jul 20, 2018
diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go
@@ -460,3 +460,25 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
 	}
 	return weight
 }
+
+// QuerySourceFile collects the matches QueryLicenseText reports for text, keeping the highest
+// value per key, then passes the map to addURLMatches together with text and returns it.
+func (db *database) QuerySourceFile(text string) map[string]float32 {
+	candidates := map[string]float32{}
+	merge := func(others map[string]float32) {
+		for key, val := range others {
+			if candidates[key] < val {
+				candidates[key] = val
+			}
+		}
+	}
+	merge(db.QueryLicenseText(text))
+	// TODO: when nothing matched, split license-comments from description-comments.
+	if db.debug {
+		for key, val := range candidates {
+			println("NLP", key, val)
+		}
+	}
+	db.addURLMatches(candidates, text)
+	return candidates
+}
diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
@@ -10,6 +10,7 @@ import (
 
 	"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
 	"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
+	"gopkg.in/src-d/enry.v1"
 )
 
 var (
@@ -62,6 +63,34 @@ var (
 
 	licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
 		"^(%s)$", strings.Join(licenseFileNames, "|")))
+
+	commentSyntaxes = map[string]*regexp.Regexp{ // comment patterns, looked up by detected language name
+		"ANTLR":       regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C":           regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C++":         regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C#":          regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"CSS":         regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
+		"Go":          regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"HTML":        regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
+		"Haskell":     regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
+		"Java":        regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"JavaScript":  regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Matlab":      regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
+		"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Perl":        regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
+		"PHP":         regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Python":      regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
+		"Ruby":        regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
+		"Rust":        regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
+		"R":           regexp.MustCompile(`#.*\t?\r?\n?`),
+		"Shell":       regexp.MustCompile(`#.*\t?\r?\n?`),
+		"Swift":       regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"SAS":         regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Scala":       regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"SQL":         regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"TypeScript":  regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"YAML":        regexp.MustCompile(`#.*\t?\r?\n?`),
+	}
 )
 
 // ExtractLicenseFiles returns the list of possible license texts.
@@ -157,3 +186,80 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
 func IsLicenseDirectory(fileName string) bool {
 	return licenseDirectoryRe.MatchString(strings.ToLower(fileName))
 }
+
+// ExtractSourceFiles reads each of the given files through fs, asks enry for its language and
+// returns what ExtractHeaderComments extracts from the texts that could be read.
+func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
+	texts := [][]byte{}
+	languages := []string{}
+	for _, name := range files {
+		content, err := fs.ReadFile(name)
+		if err != nil {
+			continue
+		}
+		// enry only gets the file name here; no content is passed for language detection
+		languages = append(languages, enry.GetLanguage(name, nil))
+		if preprocess, ok := filePreprocessors[paths.Ext(name)]; ok {
+			content = preprocess(content)
+		}
+		texts = append(texts, content)
+	}
+	if len(texts) == 0 {
+		return texts
+	}
+	return ExtractHeaderComments(texts, languages)
+}
+
+// ExtractHeaderComments collects the comments found in the first 1024 bytes of every candidate,
+// using the commentSyntaxes entry for langs[i]; languages without one are listed once on stdout.
+func ExtractHeaderComments(candidates [][]byte, langs []string) [][]byte {
+	comments := [][]byte{}
+	var unsupported []string
+	reported := map[string]bool{}
+	for i, candidate := range candidates {
+		lang := langs[i]
+		header := candidate
+		if len(header) > 1024 {
+			header = header[:1024]
+		}
+		reg, supported := commentSyntaxes[lang]
+		if !supported {
+			// names are compared exactly (a name is not a regexp); empty names are not reported
+			if lang != "" && !reported[lang] {
+				reported[lang] = true
+				unsupported = append(unsupported, lang)
+			}
+			continue
+		}
+		if match := reg.FindAllString(string(header), -1); match != nil {
+			comments = append(comments, []byte(strings.Join(match, "")))
+		}
+	}
+	if len(unsupported) > 0 {
+		fmt.Println("The following file types were not investigated for licenses on the comments:",
+			strings.Join(unsupported, ", ")+". ")
+	}
+	return comments
+}
+
+// InvestigateHeaderComments runs InvestigateHeaderComment on every text and keeps, for each
+// key of the returned maps, the highest value seen. The fs argument is not used.
+func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
+	best := map[string]float32{}
+	for _, comment := range texts {
+		found := InvestigateHeaderComment(comment)
+		for license, score := range found {
+			// a key absent from best reads as 0, so only positive scores get stored
+			if best[license] < score {
+				best[license] = score
+			}
+		}
+	}
+	return best
+}
+
+// InvestigateHeaderComment returns globalLicenseDatabase().QuerySourceFile's result for text.
+func InvestigateHeaderComment(text []byte) map[string]float32 {
+	db := globalLicenseDatabase()
+	return db.QuerySourceFile(string(text))
+}
diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go
@@ -43,12 +43,36 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
 	}
 	// Plan B: take the README, find the section about the license and apply NER
 	candidates = internal.ExtractReadmeFiles(fileNames, fs)
-	if len(candidates) == 0 {
-		return nil, ErrNoLicenseFound
+	if len(candidates) > 0 {
+		licenses = internal.InvestigateReadmeTexts(candidates, fs)
+		if len(licenses) > 0 {
+			return licenses, nil
+		}
+	}
+	// Plan C: look for license texts in the header comments of source code files
+	var extendedFileNames []string
+	extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
+	candidates = internal.ExtractSourceFiles(extendedFileNames, fs)
+	if len(candidates) > 0 {
+		licenses = internal.InvestigateHeaderComments(candidates, fs)
 	}
-	licenses = internal.InvestigateReadmeTexts(candidates, fs)
 	if len(licenses) == 0 {
 		return nil, ErrNoLicenseFound
 	}
 	return licenses, nil
 }
+
+// extractAllSubfiles appends to fileNames the path of every file found under path, descending
+// into directories; a directory for which ReadDir returns an error contributes nothing.
+func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
+	if entries, err := fs.ReadDir(path); err == nil {
+		for _, entry := range entries {
+			if full := paths.Join(path, entry.Name); entry.IsDir {
+				fileNames = extractAllSubfiles(fs, fileNames, full)
+			} else {
+				fileNames = append(fileNames, full)
+			}
+		}
+	}
+	return fileNames
+}