
Commit 110d050

Merge pull request #183 from smacker/add_aws_s3_support

add support for Amazon Web Services S3

2 parents: 90587da + 3258f5d

File tree: 3 files changed (+46, -9 lines)


build.sbt

Lines changed: 2 additions & 1 deletion
@@ -23,6 +23,7 @@ libraryDependencies ++= Seq(
   engine % Compile,
   jgit % Compile,
   gcs % Compile,
+  hadoopAws % Compile,
   fixNetty,
   cassandraDriverMetrics % Provided, // needed for using the driver w/o Spark from SparkConnector
   cassandraSparkConnector % Compile,
@@ -34,7 +35,6 @@ libraryDependencies ++= Seq(
   bblfshClient % Compile,
   avro % Compile,
   parquetAvro % Compile,
-  hadoopCommon % Compile,
   scalaJsonParser % Compile // needed for docFreq reading
   //TODO(bzz): remove scalaJsonParser at https://github.com/src-d/gemini/issues/112
 )
@@ -44,6 +44,7 @@ assemblyJarName in assemblyPackageDependency := s"${name.value}-deps.jar"

 assemblyMergeStrategy in assembly := {
   case "META-INF/io.netty.versions.properties" => MergeStrategy.last
+  case "mime.types" => MergeStrategy.last
   case x =>
     val oldStrategy = (assemblyMergeStrategy in assembly).value
     oldStrategy(x)
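
Why the extra merge rule: hadoop-aws pulls in the AWS Java SDK, and with it more than one jar on the assembly classpath ships a mime.types resource. sbt-assembly's default deduplicate strategy then fails the fat-jar build on the conflicting copies, so MergeStrategy.last keeps a single (interchangeable) copy. A quick way to check, assuming the project's standard sbt-assembly tasks:

    # Rebuild both fat jars; without the "mime.types" rule above this aborts
    # with an sbt-assembly error of the form
    #   deduplicate: different file contents found in the following: ... mime.types
    sbt assembly assemblyPackageDependency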

project/Dependencies.scala

Lines changed: 6 additions & 4 deletions
@@ -18,10 +18,12 @@ object Dependencies {
   lazy val log4jBinding = "org.slf4j" % "slf4j-log4j12" % "1.7.25"
   lazy val avro = "org.apache.avro" % "avro" % "1.7.7"
   lazy val parquetAvro = "org.apache.parquet" % "parquet-avro" % "1.9.0"
-  // same version as in spark and exclude conflicting dependencies with the cassandra driver
-  // similar to:
-  // https://github.com/datastax/spark-cassandra-connector/blob/v2.0.6/project/SparkCassandraConnectorBuild.scala#L184
-  lazy val hadoopCommon = ("org.apache.hadoop" % "hadoop-common" % "2.6.5")
+  // version compatible with Spark 2.2
+  // hadoop-aws brings the hadoop-common package with it, which gemini itself needs
+  lazy val hadoopAws = ("org.apache.hadoop" % "hadoop-aws" % "2.7.2")
+  // excludes dependencies that conflict with the cassandra driver
+  // similar to:
+  // https://github.com/datastax/spark-cassandra-connector/blob/v2.0.6/project/SparkCassandraConnectorBuild.scala#L184
     .exclude("com.sun.jersey", "jersey-server")
     .exclude("commons-beanutils", "commons-beanutils-core")
   lazy val scalapb = "com.thesamet.scalapb" %% "scalapb-runtime" % "0.8.4"
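
Pieced together, the declaration this hunk produces reads as follows; this is just the post-patch state of project/Dependencies.scala restated without diff markers:

    // Version compatible with Spark 2.2. hadoop-aws transitively brings in
    // hadoop-common, which gemini itself needs, so the explicit hadoop-common
    // dependency is dropped.
    lazy val hadoopAws = ("org.apache.hadoop" % "hadoop-aws" % "2.7.2")
      // Exclude dependencies that conflict with the Cassandra driver, similar to:
      // https://github.com/datastax/spark-cassandra-connector/blob/v2.0.6/project/SparkCassandraConnectorBuild.scala#L184
      .exclude("com.sun.jersey", "jersey-server")
      .exclude("commons-beanutils", "commons-beanutils-core")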

src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala

Lines changed: 38 additions & 4 deletions
@@ -25,8 +25,11 @@ case class HashAppConfig(
   docFreqFile: String = "",
   verbose: Boolean = false,
   mode: String = Gemini.fileSimilarityMode,
+  replace: Boolean = false,
   gcsKeyFile: String = "",
-  replace: Boolean = false
+  awsKey: String = "",
+  awsSecret: String = "",
+  awsS3Endpoint: String = ""
 )
 
 /**
@@ -96,12 +99,21 @@ object HashSparkApp extends App with Logging {
     opt[String]("doc-freq-file")
       .action((x, c) => c.copy(docFreqFile = x))
      .text("path to file with feature frequencies")
-    opt[String]("gcs-keyfile")
-      .action((x, c) => c.copy(gcsKeyFile = x))
-      .text("path to JSON keyfile for authentication in Google Cloud Storage")
     opt[Unit]("replace")
       .action((x, c) => c.copy(replace = true))
       .text("replace results of previous hashing")
+    opt[String]("gcs-keyfile")
+      .action((x, c) => c.copy(gcsKeyFile = x))
+      .text("path to JSON keyfile for authentication in Google Cloud Storage")
+    opt[String]("aws-key")
+      .action((x, c) => c.copy(awsKey = x))
+      .text("AWS access key")
+    opt[String]("aws-secret")
+      .action((x, c) => c.copy(awsSecret = x))
+      .text("AWS secret access key")
+    opt[String]("aws-s3-endpoint")
+      .action((x, c) => c.copy(awsS3Endpoint = x))
+      .text("regional S3 endpoint")
     arg[String]("<path-to-git-repos>")
       .required()
       .action((x, c) => c.copy(reposPath = x))
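
With these options in place, a hashing run against an S3 bucket would be invoked roughly like this (placeholder values; ./hash is the launcher script referenced in the comments in the next hunk):

    ./hash \
      --aws-key <access-key-id> \
      --aws-secret <secret-access-key> \
      --aws-s3-endpoint s3.eu-west-1.amazonaws.com \
      s3a://<bucket>/repos/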
@@ -132,6 +144,28 @@ object HashSparkApp extends App with Logging {
     spark.sparkContext.hadoopConfiguration.set("google.cloud.auth.service.account.json.keyfile", config.gcsKeyFile)
   }
 
+  // AWS S3 combo
+  // The problem is that we use old versions of Spark & Hadoop, which have 4 issues:
+  // 1. They bring in an old amazon-aws package as a dependency,
+  //    which requires a separate flag to enable support for the current AWS protocol
+  spark.sparkContext.hadoopConfiguration.set("com.amazonaws.services.s3.enableV4", "true")
+  // 2. Only NativeS3FileSystem works correctly with the new protocol
+  spark.sparkContext.hadoopConfiguration.set("fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
+  // 3. The client is configured to use the default S3A service endpoint,
+  //    but for the v4 protocol it must be set to the regional endpoint the bucket belongs to
+  if (config.awsS3Endpoint.nonEmpty) {
+    spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", config.awsS3Endpoint)
+  }
+  // 4. Globbing (from jgit-spark-connector) with key & secret in the URL isn't supported by the current version:
+  //    $ ./hash s3a://key:token@bucket/repos/
+  //    Error: "Wrong FS: s3a://key:token@bucket/repos/*, expected: s3a://key:token@bucket"
+  if (config.awsKey.nonEmpty) {
+    spark.sparkContext.hadoopConfiguration.set("fs.s3a.awsAccessKeyId", config.awsKey)
+  }
+  if (config.awsSecret.nonEmpty) {
+    spark.sparkContext.hadoopConfiguration.set("fs.s3a.awsSecretAccessKey", config.awsSecret)
+  }
+
   val reposPath = config.reposPath
   val repos = listRepositories(reposPath, config.format, spark, config.limit)
   printRepositories(reposPath, repos)
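
For reference, the same four properties could also be injected without code changes through Spark's spark.hadoop.* configuration passthrough, which copies any such property into the Hadoop configuration; a sketch with placeholder values, assuming the job is launched via spark-submit:

    spark-submit \
      --conf spark.hadoop.com.amazonaws.services.s3.enableV4=true \
      --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3native.NativeS3FileSystem \
      --conf spark.hadoop.fs.s3a.endpoint=<regional-endpoint> \
      --conf spark.hadoop.fs.s3a.awsAccessKeyId=<access-key-id> \
      --conf spark.hadoop.fs.s3a.awsSecretAccessKey=<secret-access-key> \
      ...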
