3 changes: 2 additions & 1 deletion build.sbt
@@ -23,6 +23,7 @@ libraryDependencies ++= Seq(
engine % Compile,
jgit % Compile,
gcs % Compile,
hadoopAws % Compile,
fixNetty,
cassandraDriverMetrics % Provided, // needed for using the Driver w/o Spark from SparkConnector
cassandraSparkConnector % Compile,
@@ -34,7 +35,6 @@ libraryDependencies ++= Seq(
bblfshClient % Compile,
avro % Compile,
parquetAvro % Compile,
hadoopCommon % Compile,
scalaJsonParser % Compile //needed for docFreq reading
//TODO(bzz): remove scalaJsonParser at https://github.com/src-d/gemini/issues/112
)
@@ -44,6 +44,7 @@ assemblyJarName in assemblyPackageDependency := s"${name.value}-deps.jar"

assemblyMergeStrategy in assembly := {
case "META-INF/io.netty.versions.properties" => MergeStrategy.last
case "mime.types" => MergeStrategy.last
case x =>
val oldStrategy = (assemblyMergeStrategy in assembly).value
oldStrategy(x)
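For reference, a minimal sketch of how the full merge strategy block reads in build.sbt once this hunk is applied (the closing brace is cut off by the diff view above); the assumption that the duplicate mime.types entries come from the AWS SDK jars pulled in transitively by hadoop-aws is mine, not stated in the diff:

assemblyMergeStrategy in assembly := {
  case "META-INF/io.netty.versions.properties" => MergeStrategy.last
  // several jars on the classpath ship a mime.types file; keeping the last copy avoids a deduplicate error
  case "mime.types" => MergeStrategy.last
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}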
10 changes: 6 additions & 4 deletions project/Dependencies.scala
@@ -18,10 +18,12 @@ object Dependencies {
lazy val log4jBinding = "org.slf4j" % "slf4j-log4j12" % "1.7.25"
lazy val avro = "org.apache.avro" % "avro" % "1.7.7"
lazy val parquetAvro = "org.apache.parquet" % "parquet-avro" % "1.9.0"
// same version as in spark and exclude conflict dependencies with cassandra driver
// similar to:
// https://github.com/datastax/spark-cassandra-connector/blob/v2.0.6/project/SparkCassandraConnectorBuild.scala#L184
lazy val hadoopCommon = ("org.apache.hadoop" % "hadoop-common" % "2.6.5")
// version compatible with Spark 2.2
// hadoop-aws brings the hadoop-common package with it, which gemini itself needs
lazy val hadoopAws = ("org.apache.hadoop" % "hadoop-aws" % "2.7.2")
// excludes dependencies that conflict with the Cassandra driver
// similar to:
// https://github.com/datastax/spark-cassandra-connector/blob/v2.0.6/project/SparkCassandraConnectorBuild.scala#L184
.exclude("com.sun.jersey", "jersey-server")
.exclude("commons-beanutils", "commons-beanutils-core")
lazy val scalapb = "com.thesamet.scalapb" %% "scalapb-runtime" % "0.8.4"
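Assembled from the hunk above, the new dependency definition in project/Dependencies.scala reads roughly as follows (a sketch; hadoop-aws 2.7.2 transitively brings in hadoop-common, which is why the explicit hadoop-common entry in build.sbt can be dropped):

// compatible with Spark 2.2; brings hadoop-common transitively, which gemini itself needs
lazy val hadoopAws = ("org.apache.hadoop" % "hadoop-aws" % "2.7.2")
  // excludes dependencies that conflict with the Cassandra driver, similar to
  // https://github.com/datastax/spark-cassandra-connector/blob/v2.0.6/project/SparkCassandraConnectorBuild.scala#L184
  .exclude("com.sun.jersey", "jersey-server")
  .exclude("commons-beanutils", "commons-beanutils-core")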
42 changes: 38 additions & 4 deletions src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala
@@ -25,8 +25,11 @@ case class HashAppConfig(
docFreqFile: String = "",
verbose: Boolean = false,
mode: String = Gemini.fileSimilarityMode,
replace: Boolean = false,
gcsKeyFile: String = "",
replace: Boolean = false
awsKey: String = "",
awsSecret: String = "",
awsS3Endpoint: String = ""
)

/**
@@ -96,12 +99,21 @@ object HashSparkApp extends App with Logging {
opt[String]("doc-freq-file")
.action((x, c) => c.copy(docFreqFile = x))
.text("path to file with feature frequencies")
opt[String]("gcs-keyfile")
.action((x, c) => c.copy(gcsKeyFile = x))
.text("path to JSON keyfile for authentication in Google Cloud Storage")
opt[Unit]("replace")
.action((x, c) => c.copy(replace = true))
.text("replace results of previous hashing")
opt[String]("gcs-keyfile")
.action((x, c) => c.copy(gcsKeyFile = x))
.text("path to JSON keyfile for authentication in Google Cloud Storage")
opt[String]("aws-key")
.action((x, c) => c.copy(awsKey = x))
.text("AWS access keys")
opt[String]("aws-secret")
.action((x, c) => c.copy(awsSecret = x))
.text("AWS access secret")
opt[String]("aws-s3-endpoint")
.action((x, c) => c.copy(awsS3Endpoint = x))
.text("region S3 endpoint")
arg[String]("<path-to-git-repos>")
.required()
.action((x, c) => c.copy(reposPath = x))
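With these options in place, a hashing run against an S3 bucket could look roughly like this (a hypothetical invocation; bucket, credentials, and endpoint are placeholders, and the endpoint must match the bucket's region as explained in the next hunk):

./hash --aws-key <ACCESS_KEY> --aws-secret <SECRET_KEY> --aws-s3-endpoint s3.eu-west-1.amazonaws.com s3a://my-bucket/repos/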
@@ -132,6 +144,28 @@ object HashSparkApp extends App with Logging {
spark.sparkContext.hadoopConfiguration.set("google.cloud.auth.service.account.json.keyfile", config.gcsKeyFile)
}

// AWS S3 configuration
// The problem is that we use an old version of Spark & Hadoop, which has 4 issues:
// 1. It brings in an old amazon-aws package as a dependency,
// which requires a separate flag to enable support for the current AWS protocol (v4)
spark.sparkContext.hadoopConfiguration.set("com.amazonaws.services.s3.enableV4", "true")
// 2. Only NativeS3FileSystem works correctly with the new protocol
spark.sparkContext.hadoopConfiguration.set("fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
// 3. The client is configured to use the default S3A service endpoint
// but for the v4 protocol it must be set to the region endpoint the bucket belongs to
if (config.awsS3Endpoint.nonEmpty) {
spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", config.awsS3Endpoint)
}
// 4. Globbing (from jgit-spark-connector) with key & secret in the URL isn't supported by the current version
// $ ./hash s3a://key:token@bucket/repos/
// Error: "Wrong FS: s3a://key:token@bucket/repos/*, expected: s3a://key:token@bucket"
if (config.awsKey.nonEmpty) {
spark.sparkContext.hadoopConfiguration.set("fs.s3a.awsAccessKeyId", config.awsKey)
}
if (config.awsSecret.nonEmpty) {
spark.sparkContext.hadoopConfiguration.set("fs.s3a.awsSecretAccessKey", config.awsSecret)
}

val reposPath = config.reposPath
val repos = listRepositories(reposPath, config.format, spark, config.limit)
printRepositories(reposPath, repos)
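As a usage sketch only (not part of this PR), the same four settings could be supplied up front through Spark's spark.hadoop.* prefix when building the session, since Spark copies such properties into the Hadoop Configuration it hands to file systems; the endpoint and credential values below are placeholders:

import org.apache.spark.sql.SparkSession

// Hypothetical alternative to the hadoopConfiguration.set calls above.
val spark = SparkSession.builder()
  .appName("hash")
  .config("spark.hadoop.com.amazonaws.services.s3.enableV4", "true") // enable the v4 signature protocol
  .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
  .config("spark.hadoop.fs.s3a.endpoint", "s3.eu-west-1.amazonaws.com") // region endpoint of the bucket
  .config("spark.hadoop.fs.s3a.awsAccessKeyId", sys.env.getOrElse("AWS_ACCESS_KEY_ID", ""))
  .config("spark.hadoop.fs.s3a.awsSecretAccessKey", sys.env.getOrElse("AWS_SECRET_ACCESS_KEY", ""))
  .getOrCreate()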