@@ -25,8 +25,11 @@ case class HashAppConfig(
   docFreqFile: String = "",
   verbose: Boolean = false,
   mode: String = Gemini.fileSimilarityMode,
+  replace: Boolean = false,
   gcsKeyFile: String = "",
-  replace: Boolean = false
+  awsKey: String = "",
+  awsSecret: String = "",
+  awsS3Endpoint: String = ""
 )
 
 /**
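For reference, the configuration case class after this hunk would look roughly as follows; this is a sketch assembled only from the lines above, and the earlier fields (reposPath, format, limit, ...) are assumed unchanged:

    case class HashAppConfig(
      // ... earlier fields (reposPath, format, limit, ...) unchanged ...
      docFreqFile: String = "",
      verbose: Boolean = false,
      mode: String = Gemini.fileSimilarityMode,
      replace: Boolean = false,
      gcsKeyFile: String = "",
      awsKey: String = "",
      awsSecret: String = "",
      awsS3Endpoint: String = ""
    )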
@@ -96,12 +99,21 @@ object HashSparkApp extends App with Logging {
     opt[String]("doc-freq-file")
       .action((x, c) => c.copy(docFreqFile = x))
       .text("path to file with feature frequencies")
-    opt[String]("gcs-keyfile")
-      .action((x, c) => c.copy(gcsKeyFile = x))
-      .text("path to JSON keyfile for authentication in Google Cloud Storage")
     opt[Unit]("replace")
       .action((x, c) => c.copy(replace = true))
       .text("replace results of previous hashing")
+    opt[String]("gcs-keyfile")
+      .action((x, c) => c.copy(gcsKeyFile = x))
+      .text("path to JSON keyfile for authentication in Google Cloud Storage")
+    opt[String]("aws-key")
+      .action((x, c) => c.copy(awsKey = x))
+      .text("AWS access key")
+    opt[String]("aws-secret")
+      .action((x, c) => c.copy(awsSecret = x))
+      .text("AWS access secret")
+    opt[String]("aws-s3-endpoint")
+      .action((x, c) => c.copy(awsS3Endpoint = x))
+      .text("S3 region endpoint")
     arg[String]("<path-to-git-repos>")
       .required()
       .action((x, c) => c.copy(reposPath = x))
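A quick sketch of how the new flags would flow through scopt. The parser construction below is a hypothetical stand-in (the real parser declares many more options), the sample key, secret, and endpoint values are made up, and HashAppConfig is assumed to provide defaults for every field, as the fields shown above do:

    import scopt.OptionParser

    // Stand-in parser with only the AWS-related options and the repos argument.
    val parser = new OptionParser[HashAppConfig]("hash") {
      opt[String]("aws-key").action((x, c) => c.copy(awsKey = x))
      opt[String]("aws-secret").action((x, c) => c.copy(awsSecret = x))
      opt[String]("aws-s3-endpoint").action((x, c) => c.copy(awsS3Endpoint = x))
      arg[String]("<path-to-git-repos>").required().action((x, c) => c.copy(reposPath = x))
    }

    val args = Seq(
      "--aws-key", "AKIA...", "--aws-secret", "...",
      "--aws-s3-endpoint", "s3.eu-west-2.amazonaws.com",
      "s3a://bucket/repos/")
    parser.parse(args, HashAppConfig()) // Some(HashAppConfig(..., awsKey = "AKIA...", ...))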
@@ -132,6 +144,28 @@ object HashSparkApp extends App with Logging {
     spark.sparkContext.hadoopConfiguration.set("google.cloud.auth.service.account.json.keyfile", config.gcsKeyFile)
   }
 
+  // AWS S3 configuration.
+  // We are on old versions of Spark & Hadoop, which causes 4 issues:
+  // 1. They pull in an old amazon-aws dependency that needs an explicit flag
+  //    to enable support for the current (V4) AWS signing protocol.
+  spark.sparkContext.hadoopConfiguration.set("com.amazonaws.services.s3.enableV4", "true")
+  // 2. Only NativeS3FileSystem works correctly with the new protocol.
+  spark.sparkContext.hadoopConfiguration.set("fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
+  // 3. The client defaults to the global S3A service endpoint, but with the V4
+  //    protocol it must be set to the region endpoint the bucket belongs to.
+  if (config.awsS3Endpoint.nonEmpty) {
+    spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", config.awsS3Endpoint)
+  }
+  // 4. Globbing (from jgit-spark-connector) with the key & secret embedded in the URL isn't supported by the current version:
+  //      $ ./hash s3a://key:token@bucket/repos/
+  //      Error: "Wrong FS: s3a://key:token@bucket/repos/*, expected: s3a://key:token@bucket"
+  if (config.awsKey.nonEmpty) {
+    spark.sparkContext.hadoopConfiguration.set("fs.s3a.awsAccessKeyId", config.awsKey)
+  }
+  if (config.awsSecret.nonEmpty) {
+    spark.sparkContext.hadoopConfiguration.set("fs.s3a.awsSecretAccessKey", config.awsSecret)
+  }
+
   val reposPath = config.reposPath
   val repos = listRepositories(reposPath, config.format, spark, config.limit)
   printRepositories(reposPath, repos)
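The same Hadoop settings can alternatively be supplied when the SparkSession is built, since Spark copies any "spark.hadoop."-prefixed option into sparkContext.hadoopConfiguration. A minimal sketch under that assumption (the session construction and credential sources here are hypothetical; the patch above sets the keys on an already-created session instead):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder()
      .appName("hash")
      // Same keys as in the hunk above, passed through the "spark.hadoop." prefix.
      .config("spark.hadoop.com.amazonaws.services.s3.enableV4", "true")
      .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
      .config("spark.hadoop.fs.s3a.endpoint", "s3.eu-west-2.amazonaws.com") // example region endpoint
      .config("spark.hadoop.fs.s3a.awsAccessKeyId", sys.env.getOrElse("AWS_ACCESS_KEY_ID", ""))
      .config("spark.hadoop.fs.s3a.awsSecretAccessKey", sys.env.getOrElse("AWS_SECRET_ACCESS_KEY", ""))
      .getOrCreate()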