src-d · smacker · Jan 25, 2019 · Jan 22, 2019 · Jan 22, 2019 · Jan 22, 2019
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -108,18 +108,17 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
     * It is used one query per distinct file
     *
     * @param conn Database connections
-    * @param format Duplicated items mode
+    * @param advancedCql use advanced cql or not (supported only by Apache Cassandra)
     * @param ccDirPath directory for connected components
     * @return
     */
-  def report(conn: Session, format: String, ccDirPath: String): ReportResult = {
+  def report(conn: Session, advancedCql: Boolean, ccDirPath: String): ReportResult = {
     val report = new Report(conn, log, keyspace, tables)
 
     log.info(s"Report duplicate items from DB $keyspace")
-    val duplicates = format match {
-      case ReportApp.defaultFmt => ReportExpandedGroup(report.findAllDuplicateItems())
-      case ReportApp.defaultFmtGroupBy => ReportExpandedGroup(report.reportCassandraGroupBy())
-      case ReportApp.condensedFmt => ReportGrouped(report.reportCassandraCondensed())
+    val duplicates = advancedCql match {
+      case false => ReportDuplicates(report.findAllDuplicateItems())
+      case true => ReportDuplicates(report.reportCassandraGroupBy())
     }
     log.info(s"${duplicates.size} duplicate SHA1s")
 

diff --git a/src/main/scala/tech/sourced/gemini/Report.scala b/src/main/scala/tech/sourced/gemini/Report.scala
@@ -21,33 +21,16 @@ import scala.sys.process._
   */
 case class ReportResult(duplicates: ReportDuplicates, similar: Iterable[Iterable[SimilarItem]])
 
-sealed abstract class ReportDuplicates(v: Iterable[Any]) {
+case class ReportDuplicates(v: Iterable[Iterable[RepoFile]]) {
   def empty(): Boolean = {
     v.isEmpty
   }
 
   def size(): Int = v.size
 }
 
-case class ReportByLine(v: Iterable[RepoFile]) extends ReportDuplicates(v)
-
-case class ReportGrouped(v: Iterable[DuplicateBlobHash]) extends ReportDuplicates(v)
-
-case class ReportExpandedGroup(v: Iterable[Iterable[RepoFile]]) extends ReportDuplicates(v)
-
 class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) {
 
-  /**
-    * Finds duplicate files among hashed repositories
-    * It is used only one query
-    * (Only supported by Apache Cassandra databases)
-    *
-    * @return
-    */
-  def reportCassandraCondensed(): Iterable[DuplicateBlobHash] = {
-    findAllDuplicateBlobHashes()
-  }
-
   /**
     * Finds duplicate files among hashed repositories
     * It is used one query per unique duplicate file, plus an extra one
@@ -56,7 +39,7 @@ class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables)
     * @return
     */
   def reportCassandraGroupBy(): Iterable[Iterable[RepoFile]] = {
-    reportCassandraCondensed()
+    findAllDuplicateBlobHashes()
       .map { item =>
         Database.findFilesByHash(item.sha, conn, keyspace, tables)
       }

diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala
@@ -1,6 +1,9 @@
 package tech.sourced.gemini.cmd
 
 import com.datastax.driver.core.Cluster
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
 import tech.sourced.gemini._
 import tech.sourced.gemini.util.Logger
 
@@ -9,16 +12,21 @@ case class ReportAppConfig(
   host: String = Gemini.defaultCassandraHost,
   port: Int = Gemini.defaultCassandraPort,
   keyspace: String = Gemini.defautKeyspace,
-  format: String = ReportApp.defaultFmt,
   ccDirPath: String = ".",
-  verbose: Boolean = false,
-  mode: String = Gemini.fileSimilarityMode
+  mode: String = Gemini.fileSimilarityMode,
+  output: String = ReportApp.defaultOutput,
+  cassandra: Boolean = false,
+  verbose: Boolean = false
 )
 
 object ReportApp extends App {
+  val outputText = "text"
+  val outputJson = "json"
+  val outputs = Array(outputText, outputJson)
+  val defaultOutput = outputText
+
   val defaultFmt = ""
   val defaultFmtGroupBy = "use-group-by"
-  val condensedFmt = "condensed"
 
   val parser = new Parser[ReportAppConfig]("./report") {
     head("Gemini Report")
@@ -40,12 +48,19 @@ object ReportApp extends App {
     opt[Unit]('v', "verbose")
       .action((_, c) => c.copy(verbose = true))
       .text("producing more verbose debug output")
-    opt[String]("format")
-      .valueName("use-group-by or condensed")
-      .action((x, c) => c.copy(format = x))
-      .text("Only for Apache Cassandra database\n" +
-        "use-group-by - use as many queries as unique duplicate files are found, plus one.\n" +
-        "condensed - use only one query to find the duplicates.")
+    opt[String]("output-format")
+      .valueName(outputs.mkString(" | "))
+      .validate(x =>
+        if (outputs contains x) {
+          success
+        } else {
+          failure(s"output must be one of: " + outputs.mkString(" | "))
+        })
+      .action((x, c) => c.copy(output = x))
+      .text("output format")
+    opt[Boolean]("cassandra")
+      .action((x, c) => c.copy(cassandra = x))
+      .text("Enable advanced cql queries for Apache Cassandra database")
   }
 
   parser.parseWithEnv(args, ReportAppConfig()) match {
@@ -64,10 +79,12 @@ object ReportApp extends App {
       log.info("Checking DB schema")
       gemini.applySchema(cassandra)
 
-      val ReportResult(duplicates, similarities) = gemini.report(cassandra, config.format, config.ccDirPath)
+      val result = gemini.report(cassandra, config.cassandra, config.ccDirPath)
 
-      print(duplicates)
-      printCommunities(similarities)
+      config.output match {
+        case `outputText` => printAsText(result)
+        case `outputJson` => printAsJson(result)
+      }
 
       log.info("Closing DB connection")
       cassandra.close()
@@ -77,23 +94,22 @@ object ReportApp extends App {
       System.exit(2)
   }
 
-  def print(report: ReportDuplicates): Unit = {
-    report match {
-      case e if e.empty() => println(s"No duplicated files found.")
-      case ReportGrouped(v) => println(s"Duplicated files found:\n\t" + (v mkString "\n\t"))
-      case ReportExpandedGroup(v) =>
-        v.foreach { item =>
-          val count = item.size
-          println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n")
-        }
+  def printAsText(result: ReportResult): Unit = {
+    val ReportResult(duplicates, similarities) = result
+
+    if (duplicates.empty()) {
+      println(s"No duplicated files found.")
+    } else {
+      duplicates.v.foreach { item =>
+        val count = item.size
+        println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n")
+      }
     }
-  }
 
-  def printCommunities(report: Iterable[Iterable[SimilarItem]]): Unit = {
-    if (report.isEmpty) {
+    if (similarities.isEmpty) {
       println(s"No similarities found.")
     } else {
-      report.foreach { community =>
+      similarities.foreach { community =>
         val count = community.size
         val typeName = community.head match {
           case SimilarFunc(_, _, _) => "functions"
@@ -103,4 +119,18 @@ object ReportApp extends App {
       }
     }
   }
+
+  def printAsJson(result: ReportResult): Unit = {
+    val ReportResult(duplicates, similarities) = result
+
+    val mapper = new ObjectMapper() with ScalaObjectMapper
+    mapper.registerModule(DefaultScalaModule)
+
+    val str = mapper.writeValueAsString(Map(
+      "duplicates" -> duplicates.v,
+      "similarities" -> similarities
+    ))
+    println(str)
+  }
+
 }
diff --git a/src/test/scala/tech/sourced/gemini/ReportSpec.scala b/src/test/scala/tech/sourced/gemini/ReportSpec.scala
@@ -65,17 +65,6 @@ class ReportSpec extends FlatSpec
     super.afterAll()
   }
 
-  "Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in {
-    val report = new Report(session, logger, DUPLICATES, Gemini.tables)
-
-    println("Query")
-    val result = report.reportCassandraCondensed()
-    println("Done")
-
-    result should have size expectedDuplicateFiles.size
-    result foreach (_.count should be(2))
-  }
-
   "Detailed Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in {
     val report = new Report(session, logger, DUPLICATES, Gemini.tables)