Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/main/scala/tech/sourced/gemini/Gemini.scala
Original file line number Diff line number Diff line change
Expand Up @@ -108,18 +108,17 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
* It uses one query per distinct file
*
* @param conn Database connections
* @param format Duplicated items mode
* @param advancedCql use advanced cql or not (supported only by Apache Cassandra)
* @param ccDirPath directory for connected components
* @return
*/
def report(conn: Session, format: String, ccDirPath: String): ReportResult = {
def report(conn: Session, advancedCql: Boolean, ccDirPath: String): ReportResult = {
val report = new Report(conn, log, keyspace, tables)

log.info(s"Report duplicate items from DB $keyspace")
val duplicates = format match {
case ReportApp.defaultFmt => ReportExpandedGroup(report.findAllDuplicateItems())
case ReportApp.defaultFmtGroupBy => ReportExpandedGroup(report.reportCassandraGroupBy())
case ReportApp.condensedFmt => ReportGrouped(report.reportCassandraCondensed())
val duplicates = advancedCql match {
case false => ReportDuplicates(report.findAllDuplicateItems())
case true => ReportDuplicates(report.reportCassandraGroupBy())
}
log.info(s"${duplicates.size} duplicate SHA1s")

Expand Down
21 changes: 2 additions & 19 deletions src/main/scala/tech/sourced/gemini/Report.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,33 +21,16 @@ import scala.sys.process._
*/
case class ReportResult(duplicates: ReportDuplicates, similar: Iterable[Iterable[SimilarItem]])

sealed abstract class ReportDuplicates(v: Iterable[Any]) {
/**
  * Duplicate files found by a report, kept as groups of files.
  *
  * @param v groups of duplicated files; each inner collection is one group
  */
case class ReportDuplicates(v: Iterable[Iterable[RepoFile]]) {

  /** @return true when no group of duplicates is present */
  def empty(): Boolean = v.isEmpty

  /** @return number of groups of duplicates */
  def size(): Int = {
    v.size
  }
}

case class ReportByLine(v: Iterable[RepoFile]) extends ReportDuplicates(v)

case class ReportGrouped(v: Iterable[DuplicateBlobHash]) extends ReportDuplicates(v)

case class ReportExpandedGroup(v: Iterable[Iterable[RepoFile]]) extends ReportDuplicates(v)

class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) {

/**
* Finds duplicate files among hashed repositories
* It is used only one query
* (Only supported by Apache Cassandra databases)
*
* @return
*/
def reportCassandraCondensed(): Iterable[DuplicateBlobHash] = {
findAllDuplicateBlobHashes()
}

/**
* Finds duplicate files among hashed repositories
* It uses one query per unique duplicate file, plus an extra one
Expand All @@ -56,7 +39,7 @@ class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables)
* @return
*/
def reportCassandraGroupBy(): Iterable[Iterable[RepoFile]] = {
reportCassandraCondensed()
findAllDuplicateBlobHashes()
.map { item =>
Database.findFilesByHash(item.sha, conn, keyspace, tables)
}
Expand Down
82 changes: 56 additions & 26 deletions src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package tech.sourced.gemini.cmd

import com.datastax.driver.core.Cluster
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import tech.sourced.gemini._
import tech.sourced.gemini.util.Logger

Expand All @@ -9,16 +12,21 @@ case class ReportAppConfig(
host: String = Gemini.defaultCassandraHost,
port: Int = Gemini.defaultCassandraPort,
keyspace: String = Gemini.defautKeyspace,
format: String = ReportApp.defaultFmt,
ccDirPath: String = ".",
verbose: Boolean = false,
mode: String = Gemini.fileSimilarityMode
mode: String = Gemini.fileSimilarityMode,
output: String = ReportApp.defaultOutput,
cassandra: Boolean = false,
verbose: Boolean = false
)

object ReportApp extends App {
val outputText = "text"
val outputJson = "json"
val outputs = Array(outputText, outputJson)
val defaultOutput = outputText

val defaultFmt = ""
val defaultFmtGroupBy = "use-group-by"
val condensedFmt = "condensed"

val parser = new Parser[ReportAppConfig]("./report") {
head("Gemini Report")
Expand All @@ -40,12 +48,19 @@ object ReportApp extends App {
opt[Unit]('v', "verbose")
.action((_, c) => c.copy(verbose = true))
.text("producing more verbose debug output")
opt[String]("format")
.valueName("use-group-by or condensed")
.action((x, c) => c.copy(format = x))
.text("Only for Apache Cassandra database\n" +
"use-group-by - use as many queries as unique duplicate files are found, plus one.\n" +
"condensed - use only one query to find the duplicates.")
opt[String]("output-format")
.valueName(outputs.mkString(" | "))
.validate(x =>
if (outputs contains x) {
success
} else {
failure(s"output must be one of: " + outputs.mkString(" | "))
})
.action((x, c) => c.copy(output = x))
.text("output format")
opt[Boolean]("cassandra")
.action((x, c) => c.copy(cassandra = x))
.text("Enable advanced cql queries for Apache Cassandra database")
}

parser.parseWithEnv(args, ReportAppConfig()) match {
Expand All @@ -64,10 +79,12 @@ object ReportApp extends App {
log.info("Checking DB schema")
gemini.applySchema(cassandra)

val ReportResult(duplicates, similarities) = gemini.report(cassandra, config.format, config.ccDirPath)
val result = gemini.report(cassandra, config.cassandra, config.ccDirPath)

print(duplicates)
printCommunities(similarities)
config.output match {
case `outputText` => printAsText(result)
case `outputJson` => printAsJson(result)
}

log.info("Closing DB connection")
cassandra.close()
Expand All @@ -77,23 +94,22 @@ object ReportApp extends App {
System.exit(2)
}

def print(report: ReportDuplicates): Unit = {
report match {
case e if e.empty() => println(s"No duplicated files found.")
case ReportGrouped(v) => println(s"Duplicated files found:\n\t" + (v mkString "\n\t"))
case ReportExpandedGroup(v) =>
v.foreach { item =>
val count = item.size
println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n")
}
def printAsText(result: ReportResult): Unit = {
val ReportResult(duplicates, similarities) = result

if (duplicates.empty()) {
println(s"No duplicated files found.")
} else {
duplicates.v.foreach { item =>
val count = item.size
println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n")
}
}
}

def printCommunities(report: Iterable[Iterable[SimilarItem]]): Unit = {
if (report.isEmpty) {
if (similarities.isEmpty) {
println(s"No similarities found.")
} else {
report.foreach { community =>
similarities.foreach { community =>
val count = community.size
val typeName = community.head match {
case SimilarFunc(_, _, _) => "functions"
Expand All @@ -103,4 +119,18 @@ object ReportApp extends App {
}
}
}

/**
  * Prints the report to stdout as one JSON document
  * with the keys "duplicates" and "similarities".
  *
  * @param result report to serialize
  */
def printAsJson(result: ReportResult): Unit = result match {
  case ReportResult(dups, sims) =>
    // DefaultScalaModule adds Jackson support for Scala types
    val jsonMapper = new ObjectMapper() with ScalaObjectMapper
    jsonMapper.registerModule(DefaultScalaModule)

    val payload = Map(
      "duplicates" -> dups.v,
      "similarities" -> sims
    )
    println(jsonMapper.writeValueAsString(payload))
}

}
11 changes: 0 additions & 11 deletions src/test/scala/tech/sourced/gemini/ReportSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,6 @@ class ReportSpec extends FlatSpec
super.afterAll()
}

"Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in {
val report = new Report(session, logger, DUPLICATES, Gemini.tables)

println("Query")
val result = report.reportCassandraCondensed()
println("Done")

result should have size expectedDuplicateFiles.size
result foreach (_.count should be(2))
}

"Detailed Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in {
val report = new Report(session, logger, DUPLICATES, Gemini.tables)

Expand Down