diff --git a/docker/foo.bin b/docker/foo.bin new file mode 100644 index 000000000..97c4296cd Binary files /dev/null and b/docker/foo.bin differ diff --git a/docker/start_db.sh b/docker/start_db.sh index a322aa97c..f5a946dbf 100755 --- a/docker/start_db.sh +++ b/docker/start_db.sh @@ -101,6 +101,13 @@ for a in ${COORDINATORS[*]} ; do curl -u root:test --insecure --fail "$SCHEME://$a/_api/version" done +echo "" +echo "" +echo "Copying test ML models into containers..." +for c in $(docker ps -a -f name=adb-.* -q) ; do + docker cp "$LOCATION"/foo.bin "$c":/tmp +done + echo "" echo "" echo "Done, your deployment is reachable at: " diff --git a/src/main/java/com/arangodb/entity/InvertedIndexField.java b/src/main/java/com/arangodb/entity/InvertedIndexField.java index 92ee15cb1..016fb2c39 100644 --- a/src/main/java/com/arangodb/entity/InvertedIndexField.java +++ b/src/main/java/com/arangodb/entity/InvertedIndexField.java @@ -18,7 +18,7 @@ public class InvertedIndexField implements Entity { private Boolean searchField; private Boolean trackListPositions; private final Set features = new HashSet<>(); - private final Collection nested = new ArrayList<>(); + private Collection nested; public String getName() { return name; @@ -79,6 +79,7 @@ public Collection getNested() { } public InvertedIndexField nested(InvertedIndexField... nested) { + if(this.nested == null) this.nested = new ArrayList<>(); Collections.addAll(this.nested, nested); return this; } diff --git a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java index 7a5ea5918..acf38797d 100644 --- a/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java +++ b/src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java @@ -24,5 +24,20 @@ * @author Michele Rastelli */ public enum AnalyzerType { - identity, delimiter, stem, norm, ngram, text, pipeline, stopwords, aql, geojson, geopoint, segmentation, collation + identity, + delimiter, + stem, + norm, + ngram, + text, + pipeline, + stopwords, + aql, + geojson, + geopoint, + segmentation, + collation, + classification, + nearest_neighbors, + minhash } diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java new file mode 100644 index 000000000..55eb3a47a --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzer.java @@ -0,0 +1,64 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer capable of classifying tokens in the input text. It applies a user-provided supervised fastText word + * embedding model to classify the input text. It is able to classify individual tokens as well as entire inputs. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.10 + */ +public class ClassificationAnalyzer extends SearchAnalyzer { + public ClassificationAnalyzer() { + setType(AnalyzerType.classification); + } + + private ClassificationAnalyzerProperties properties; + + public ClassificationAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(ClassificationAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + ClassificationAnalyzer that = (ClassificationAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java new file mode 100644 index 000000000..76092580e --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/ClassificationAnalyzerProperties.java @@ -0,0 +1,78 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.velocypack.annotations.SerializedName; + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.10 + */ +public class ClassificationAnalyzerProperties { + + @SerializedName("model_location") + private String modelLocation; + + @SerializedName("top_k") + private Integer topK; + + private Double threshold; + + public String getModelLocation() { + return modelLocation; + } + + public void setModelLocation(String modelLocation) { + this.modelLocation = modelLocation; + } + + public Integer getTopK() { + return topK; + } + + public void setTopK(Integer topK) { + this.topK = topK; + } + + public Double getThreshold() { + return threshold; + } + + public void setThreshold(Double threshold) { + this.threshold = threshold; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + ClassificationAnalyzerProperties that = (ClassificationAnalyzerProperties) o; + return Objects.equals(modelLocation, that.modelLocation) && Objects.equals(topK, that.topK) && Objects.equals(threshold, that.threshold); + } + + @Override + public int hashCode() { + return Objects.hash(modelLocation, topK, threshold); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java new file mode 100644 index 000000000..116103e52 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzer.java @@ -0,0 +1,64 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer that computes so called MinHash signatures using a locality-sensitive hash function. It applies an + * Analyzer of your choice before the hashing, for example, to break up text into words. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.10 + */ +public class MinHashAnalyzer extends SearchAnalyzer { + public MinHashAnalyzer() { + setType(AnalyzerType.minhash); + } + + private MinHashAnalyzerProperties properties; + + public MinHashAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(MinHashAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + MinHashAnalyzer that = (MinHashAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java new file mode 100644 index 000000000..a451c0525 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/MinHashAnalyzerProperties.java @@ -0,0 +1,63 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.10 + */ +public class MinHashAnalyzerProperties { + + private SearchAnalyzer analyzer; + private Integer numHashes; + + public SearchAnalyzer getAnalyzer() { + return analyzer; + } + + public void setAnalyzer(SearchAnalyzer analyzer) { + this.analyzer = analyzer; + } + + public Integer getNumHashes() { + return numHashes; + } + + public void setNumHashes(Integer numHashes) { + this.numHashes = numHashes; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + MinHashAnalyzerProperties that = (MinHashAnalyzerProperties) o; + return Objects.equals(analyzer, that.analyzer) && Objects.equals(numHashes, that.numHashes); + } + + @Override + public int hashCode() { + return Objects.hash(analyzer, numHashes); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java new file mode 100644 index 000000000..c8641db4f --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzer.java @@ -0,0 +1,66 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.entity.arangosearch.AnalyzerType; + +import java.util.Objects; + +/** + * An Analyzer capable of finding nearest neighbors of tokens in the input. It applies a user-provided supervised + * fastText word embedding model to retrieve nearest neighbor tokens in the text. It is able to find neighbors of + * individual tokens as well as entire input strings. For entire input strings, the Analyzer will return nearest + * neighbors for each token within the input string. + * + * @author Michele Rastelli + * @see API Documentation + * @since ArangoDB 3.10 + */ +public class NearestNeighborsAnalyzer extends SearchAnalyzer { + public NearestNeighborsAnalyzer() { + setType(AnalyzerType.nearest_neighbors); + } + + private NearestNeighborsAnalyzerProperties properties; + + public NearestNeighborsAnalyzerProperties getProperties() { + return properties; + } + + public void setProperties(NearestNeighborsAnalyzerProperties properties) { + this.properties = properties; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + NearestNeighborsAnalyzer that = (NearestNeighborsAnalyzer) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), properties); + } +} diff --git a/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java new file mode 100644 index 000000000..42335b299 --- /dev/null +++ b/src/main/java/com/arangodb/entity/arangosearch/analyzer/NearestNeighborsAnalyzerProperties.java @@ -0,0 +1,69 @@ +/* + * DISCLAIMER + * + * Copyright 2016 ArangoDB GmbH, Cologne, Germany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright holder is ArangoDB GmbH, Cologne, Germany + */ + +package com.arangodb.entity.arangosearch.analyzer; + + +import com.arangodb.velocypack.annotations.SerializedName; + +import java.util.Objects; + +/** + * @author Michele Rastelli + * @since ArangoDB 3.10 + */ +public class NearestNeighborsAnalyzerProperties { + + @SerializedName("model_location") + private String modelLocation; + + @SerializedName("top_k") + private Integer topK; + + + public String getModelLocation() { + return modelLocation; + } + + public void setModelLocation(String modelLocation) { + this.modelLocation = modelLocation; + } + + public Integer getTopK() { + return topK; + } + + public void setTopK(Integer topK) { + this.topK = topK; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NearestNeighborsAnalyzerProperties that = (NearestNeighborsAnalyzerProperties) o; + return Objects.equals(modelLocation, that.modelLocation) && Objects.equals(topK, that.topK); + } + + @Override + public int hashCode() { + return Objects.hash(modelLocation, topK); + } +} diff --git a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java index 9178657ce..5c72b8e0e 100644 --- a/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java +++ b/src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java @@ -102,6 +102,12 @@ public class VPackDeserializers { return context.deserialize(vpack, SegmentationAnalyzer.class); case collation: return context.deserialize(vpack, CollationAnalyzer.class); + case classification: + return context.deserialize(vpack, ClassificationAnalyzer.class); + case nearest_neighbors: + return context.deserialize(vpack, NearestNeighborsAnalyzer.class); + case minhash: + return context.deserialize(vpack, MinHashAnalyzer.class); default: throw new IllegalArgumentException("Unknown analyzer type: " + type); } diff --git a/src/test/java/com/arangodb/ArangoSearchTest.java b/src/test/java/com/arangodb/ArangoSearchTest.java index 22494fb02..3a3d66ce8 100644 --- a/src/test/java/com/arangodb/ArangoSearchTest.java +++ b/src/test/java/com/arangodb/ArangoSearchTest.java @@ -987,6 +987,83 @@ void collationAnalyzer(ArangoDatabase db) { createGetAndDeleteTypedAnalyzer(db, collationAnalyzer); } + @ParameterizedTest(name = "{index}") + @MethodSource("dbs") + void classificationAnalyzer(ArangoDatabase db) { + assumeTrue(isAtLeastVersion(3, 10)); + assumeTrue(isEnterprise()); + + ClassificationAnalyzerProperties properties = new ClassificationAnalyzerProperties(); + properties.setModelLocation("/tmp/foo.bin"); + properties.setTopK(2); + properties.setThreshold(.5); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + ClassificationAnalyzer analyzer = new ClassificationAnalyzer(); + analyzer.setName("test-" + UUID.randomUUID()); + analyzer.setProperties(properties); + analyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(db, analyzer); + } + + @ParameterizedTest(name = "{index}") + @MethodSource("dbs") + void nearestNeighborsAnalyzer(ArangoDatabase db) { + assumeTrue(isAtLeastVersion(3, 10)); + assumeTrue(isEnterprise()); + + NearestNeighborsAnalyzerProperties properties = new NearestNeighborsAnalyzerProperties(); + properties.setModelLocation("/tmp/foo.bin"); + properties.setTopK(2); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + NearestNeighborsAnalyzer analyzer = new NearestNeighborsAnalyzer(); + analyzer.setName("test-" + UUID.randomUUID()); + analyzer.setProperties(properties); + analyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(db, analyzer); + } + + @ParameterizedTest(name = "{index}") + @MethodSource("dbs") + void MinHashAnalyzer(ArangoDatabase db) { + assumeTrue(isAtLeastVersion(3, 10)); + assumeTrue(isEnterprise()); + + SegmentationAnalyzerProperties segProperties = new SegmentationAnalyzerProperties(); + segProperties.setBreakMode(SegmentationAnalyzerProperties.BreakMode.alpha); + segProperties.setAnalyzerCase(SearchAnalyzerCase.lower); + + SegmentationAnalyzer segAnalyzer = new SegmentationAnalyzer(); + segAnalyzer.setProperties(segProperties); + + MinHashAnalyzerProperties properties = new MinHashAnalyzerProperties(); + properties.setAnalyzer(segAnalyzer); + properties.setNumHashes(2); + + Set features = new HashSet<>(); + features.add(AnalyzerFeature.frequency); + features.add(AnalyzerFeature.norm); + features.add(AnalyzerFeature.position); + + MinHashAnalyzer analyzer = new MinHashAnalyzer(); + analyzer.setName("test-" + UUID.randomUUID()); + analyzer.setProperties(properties); + analyzer.setFeatures(features); + + createGetAndDeleteTypedAnalyzer(db, analyzer); + } + @ParameterizedTest(name = "{index}") @MethodSource("dbs") void offsetFeature(ArangoDatabase db) { diff --git a/src/test/resources/logback-test.xml b/src/test/resources/logback-test.xml index 579f1b9db..f67855e9c 100644 --- a/src/test/resources/logback-test.xml +++ b/src/test/resources/logback-test.xml @@ -8,7 +8,7 @@ - +