Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Create github action for syncing and exporting vector DB #69

Merged
merged 11 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions .github/workflows/import_packages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
name: Sync vector DB

on:
  workflow_dispatch:

jobs:
  sync_db:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install "."

      # gh is usually preinstalled on ubuntu-latest; install explicitly so the
      # job does not depend on the runner image contents.
      - name: Install GitHub CLI
        run: |
          sudo apt-get update
          sudo apt-get install -y gh

      - name: Fetch latest successful workflow run ID
        id: get-run-id
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          # Filter on .path (the workflow file path reported by the API), not
          # .name: the run's .name is the workflow's display name
          # ("Sync vector DB"), so comparing it to the file path never matches.
          workflow_path=".github/workflows/import_packages.yml"
          run_id=$(gh api --paginate repos/${{ github.repository }}/actions/runs --jq ".workflow_runs[] | select(.path == \"$workflow_path\" and .conclusion == \"success\") | .id" | head -n 1)
          # "::set-output" is deprecated; write to the $GITHUB_OUTPUT file.
          echo "run_id=$run_id" >> "$GITHUB_OUTPUT"

      # Restores the previously exported weaviate_data volume (if any) so the
      # import below only has to add new/changed packages.
      - name: Download the latest artifact
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          gh run download ${{ steps.get-run-id.outputs.run_id }}

      - name: Run sync
        run: |
          export PYTHONPATH=$PYTHONPATH:./
          python scripts/import_packages.py

      - name: 'Upload Volume'
        uses: actions/upload-artifact@v4
        with:
          name: database_volume
          path: weaviate_data
          retention-days: 5
9,309 changes: 9,309 additions & 0 deletions data/archived.jsonl

Large diffs are not rendered by default.

31,572 changes: 31,572 additions & 0 deletions data/deprecated.jsonl

Large diffs are not rendered by default.

25,480 changes: 25,480 additions & 0 deletions data/malicious.jsonl

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@ description = "Generative AI CodeGen security gateway"
readme = "README.md"
authors = []
packages = [{include = "codegate", from = "src"}]
requires-python = ">=3.11"

[tool.poetry.dependencies]
python = ">=3.11"
click = ">=8.1.0"
PyYAML = ">=6.0.1"
fastapi = ">=0.115.5"
uvicorn = ">=0.32.1"
weaviate = ">=0.1.2"
weaviate-client = ">=4.9.3"
torch = ">=2.5.1"
transformers = ">=4.46.3"

litellm = "^1.52.15"
[tool.poetry.group.dev.dependencies]
Expand Down
120 changes: 120 additions & 0 deletions scripts/import_packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import json
from utils.embedding_util import generate_embeddings
import weaviate
from weaviate.embedded import EmbeddedOptions
from weaviate.classes.config import Property, DataType


# JSONL source files; each line is one package record. The file name
# determines the "status" value assigned to each record during import
# (see add_data).
json_files = [
    'data/archived.jsonl',
    'data/deprecated.jsonl',
    'data/malicious.jsonl',
]


def setup_schema(client):
    """Create the "Package" collection if it does not exist yet (idempotent)."""
    if client.collections.exists("Package"):
        return
    # All four attributes of a package record are stored as plain text.
    field_names = ("name", "type", "status", "description")
    client.collections.create(
        "Package",
        properties=[
            Property(name=field, data_type=DataType.TEXT)
            for field in field_names
        ]
    )


def generate_vector_string(package):
vector_str = f"{package['name']}"
# add description
package_url = ""
if package["type"] == "pypi":
vector_str += " is a Python package available on PyPI"
package_url = f"https://trustypkg.dev/pypi/{package['name']}"
elif package["type"] == "npm":
vector_str += " is a JavaScript package available on NPM"
package_url = f"https://trustypkg.dev/npm/{package['name']}"
elif package["type"] == "go":
vector_str += " is a Go package. "
package_url = f"https://trustypkg.dev/go/{package['name']}"
elif package["type"] == "crates":
vector_str += " is a Rust package available on Crates. "
package_url = f"https://trustypkg.dev/crates/{package['name']}"
elif package["type"] == "java":
vector_str += " is a Java package. "
package_url = f"https://trustypkg.dev/java/{package['name']}"

# add extra status
if package["status"] == "archived":
vector_str += f". However, this package is found to be archived and no longer maintained. For additional information refer to {package_url}"
elif package["status"] == "deprecated":
vector_str += f". However, this package is found to be deprecated and no longer recommended for use. For additional information refer to {package_url}"
elif package["status"] == "malicious":
vector_str += f". However, this package is found to be malicious. For additional information refer to {package_url}"
return vector_str


def add_data(client):
    """Import package records from the JSONL files into the "Package" collection.

    Skips any record whose status and description are unchanged compared to
    what is already stored, so re-runs only embed new or modified packages.
    """
    collection = client.collections.get("Package")

    # read all the data from db, we will only add if there is no data, or is different
    existing_packages = list(collection.iterator())
    packages_dict = {}
    for package in existing_packages:
        # Existing objects are keyed by "<name>/<type>" for change detection.
        key = package.properties['name']+"/"+package.properties['type']
        value = {
            'status': package.properties['status'],
            'description': package.properties['description'],
        }
        packages_dict[key] = value

    for json_file in json_files:
        with open(json_file, 'r') as f:
            print("Adding data from", json_file)
            # Dynamic batching lets the client size inserts automatically.
            with collection.batch.dynamic() as batch:
                for line in f:
                    package = json.loads(line)

                    # now add the status column, derived from the source file
                    # name (e.g. data/archived.jsonl -> "archived")
                    if 'archived' in json_file:
                        package['status'] = 'archived'
                    elif 'deprecated' in json_file:
                        package['status'] = 'deprecated'
                    elif 'malicious' in json_file:
                        package['status'] = 'malicious'
                    else:
                        package['status'] = 'unknown'

                    # check for the existing package and only add if different
                    # NOTE(review): a changed package is added again rather than
                    # updated in place -- presumably duplicates are acceptable
                    # downstream; verify.
                    key = package['name']+"/"+package['type']
                    if key in packages_dict:
                        if packages_dict[key]['status'] == package['status'] and packages_dict[key]['description'] == package['description']:
                            print("Package already exists", key)
                            continue

                    # prepare the object for embedding
                    print("Generating data for", key)
                    vector_str = generate_vector_string(package)
                    vector = generate_embeddings(vector_str)

                    batch.add_object(properties=package, vector=vector)


def run_import():
    """Start an embedded Weaviate instance and import all package data.

    The embedded database persists under ./weaviate_data, which the CI
    workflow uploads as an artifact after this script completes.
    """
    client = weaviate.WeaviateClient(
        embedded_options=EmbeddedOptions(
            persistence_data_path="./weaviate_data",
            # NOTE(review): non-default gRPC port, presumably to avoid clashing
            # with a locally running Weaviate -- confirm.
            grpc_port=50052
        ),
    )
    with client:
        client.connect()
        print('is_ready:', client.is_ready())

        setup_schema(client)
        add_data(client)


if __name__ == '__main__':
    run_import()
Empty file added utils/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions utils/embedding_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch import Tensor
import os
import warnings

# The transformers library internally is creating this warning, but does not
# impact our app. Safe to ignore.
warnings.filterwarnings(action='ignore', category=ResourceWarning)


# We won't have competing threads in this example app
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Initialize tokenizer and model for GTE-base.
# NOTE: these load at import time, so importing this module incurs the model
# download/load cost (network on first run, then memory) before any call.
tokenizer = AutoTokenizer.from_pretrained('thenlper/gte-base')
model = AutoModel.from_pretrained('thenlper/gte-base')


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings over the unmasked (attended) positions.

    Masked positions contribute zero to the sum; the divisor is the number
    of unmasked tokens per sequence.
    """
    keep = attention_mask[..., None].bool()
    zeroed = torch.where(
        keep, last_hidden_states, torch.zeros_like(last_hidden_states))
    token_counts = attention_mask.sum(dim=1)[..., None]
    return zeroed.sum(dim=1) / token_counts


def generate_embeddings(text):
    """Return a GTE-base embedding for *text* as a plain list of floats.

    The input is truncated to the model's 512-token limit. The token
    embeddings are average-pooled and L2-normalized, so downstream cosine
    similarity reduces to a dot product.
    """
    # (The original span contained scraped PR-review text between the def
    # line and the body; removed to restore a valid function.)
    inputs = tokenizer(text, return_tensors='pt',
                       max_length=512, truncation=True)
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    attention_mask = inputs['attention_mask']
    embeddings = average_pool(outputs.last_hidden_state, attention_mask)

    # (Optionally) normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    return embeddings.numpy().tolist()[0]