Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions src/main/scala/tech/sourced/gemini/Hash.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.cassandra._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.{udf => sparkUdf} // udf name conflicts with engine
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.bblfsh.client.BblfshClient
Expand Down Expand Up @@ -42,6 +43,9 @@ class Hash(session: SparkSession,
mode: String = Gemini.fileSimilarityMode,
docFreqPath: String = "") {

// very small files produce too many false positives
val fileSizeThresholdBytes = 500

import session.implicits._

def report(header: String, countProcessed: Long, skipped: MapAccumulator): Unit = {
Expand Down Expand Up @@ -120,20 +124,31 @@ class Hash(session: SparkSession,
/**
  * Lists the blobs of the given repositories that are candidates for hashing.
  *
  * Walks HEAD references -> commits -> tree entries -> blobs, then drops binary
  * files, paths that Enry reports as vendored, and empty files. A `content_size`
  * column holding the byte length of `content` is added to the result.
  *
  * @param repos repositories to list files from
  * @return blobs with the extra `content_size` column, empty files excluded
  */
protected def filesForRepos(repos: DataFrame): DataFrame = {
  log.warn("Listing files")

  val fileSizeUdf = sparkUdf { (content: Array[Byte]) => content.length }

  // The vendor filter is applied once; applying the same pure predicate twice
  // yields the same rows. `=!=` is used instead of `!==`, which is deprecated
  // since Spark 2.0.
  repos
    .getHEAD
    .getCommits
    .getTreeEntries
    .getBlobs
    .filter('is_binary === false)
    .filter(r => !Enry.isVendor(r.getAs[String]("path")))
    .withColumn("content_size", fileSizeUdf('content))
    .filter('content_size =!= 0) // empty files only pollute results
}

protected def extractUast(files: DataFrame): DataFrame = {
log.warn("Extracting UASTs")

files
.dropDuplicates("blob_id")
val blobs = files.dropDuplicates("blob_id")

val filteredBlobs = if (mode == Gemini.fileSimilarityMode) {
blobs.filter('content_size > fileSizeThresholdBytes)
} else {
blobs
}

filteredBlobs
.classifyLanguages
.filter('lang.isNotNull)
.extractUASTs
Expand Down
6 changes: 3 additions & 3 deletions src/test/scala/tech/sourced/gemini/ReportSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,12 @@ class ReportSpec extends FlatSpec
val similarGroups = report.findSimilarItems("/tmp/report-files-test", Gemini.fileSimilarityMode)
println("Done")

similarGroups should have size 6
similarGroups should have size 7

val files = similarGroups.head.map(_.toString)
files.toSeq should contain theSameElementsAs Seq(
"https://github.com/erizocosmico/borges/blob/b1fcd3bf0ba810c05cb418babc09cc7f7783cc03/fixtures_test.go",
"https://github.com/src-d/borges/blob/e784f9d5f59d5c081c5f8f71b6c517918b899df0/fixtures_test.go"
"https://github.com/erizocosmico/borges/blob/b1fcd3bf0ba810c05cb418babc09cc7f7783cc03/consumer_test.go",
"https://github.com/src-d/borges/blob/e784f9d5f59d5c081c5f8f71b6c517918b899df0/consumer_test.go"
)
}

Expand Down