Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Create github action for syncing and exporting vector DB #69

Merged
merged 11 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions .github/workflows/import_packages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
name: Sync vector DB

on:
  workflow_dispatch:

jobs:
  sync_db:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install "."

      # gh is usually preinstalled on ubuntu-latest; install explicitly so the
      # job does not depend on the runner image contents.
      - name: Install GitHub CLI
        run: |
          sudo apt-get update
          sudo apt-get install -y gh

      - name: Fetch latest successful workflow run ID
        id: get-run-id
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          # Filter on .path (the workflow file path reported by the API), not
          # .name: the run's .name is the workflow's display name
          # ("Sync vector DB"), so comparing it to the file path never matches.
          workflow_path=".github/workflows/import_packages.yml"
          run_id=$(gh api --paginate repos/${{ github.repository }}/actions/runs --jq ".workflow_runs[] | select(.path == \"$workflow_path\" and .conclusion == \"success\") | .id" | head -n 1)
          # "::set-output" is deprecated; write to the $GITHUB_OUTPUT file.
          echo "run_id=$run_id" >> "$GITHUB_OUTPUT"

      # Restores the previously exported weaviate_data volume (if any) so the
      # import below only has to add new/changed packages.
      - name: Download the latest artifact
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          gh run download ${{ steps.get-run-id.outputs.run_id }}

      - name: Run sync
        run: |
          export PYTHONPATH=$PYTHONPATH:./
          python scripts/import_packages.py

      - name: 'Upload Volume'
        uses: actions/upload-artifact@v4
        with:
          name: database_volume
          path: weaviate_data
          retention-days: 5
9,309 changes: 9,309 additions & 0 deletions data/archived.jsonl

Large diffs are not rendered by default.

31,572 changes: 31,572 additions & 0 deletions data/deprecated.jsonl

Large diffs are not rendered by default.

25,480 changes: 25,480 additions & 0 deletions data/malicious.jsonl

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@ description = "Generative AI CodeGen security gateway"
readme = "README.md"
authors = []
packages = [{include = "codegate", from = "src"}]
requires-python = ">=3.11"

[tool.poetry.dependencies]
python = ">=3.11"
click = ">=8.1.0"
PyYAML = ">=6.0.1"
fastapi = ">=0.115.5"
uvicorn = ">=0.32.1"
weaviate = ">=0.1.2"
weaviate-client = ">=4.9.3"
torch = ">=2.5.1"
transformers = ">=4.46.3"

litellm = "^1.52.15"
[tool.poetry.group.dev.dependencies]
Expand Down
120 changes: 120 additions & 0 deletions scripts/import_packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import json
from utils.embedding_util import generate_embeddings
import weaviate
from weaviate.embedded import EmbeddedOptions
from weaviate.classes.config import Property, DataType


# JSONL source files; each line is one package record. The file name
# determines the "status" value assigned to each record during import
# (see add_data).
json_files = [
    'data/archived.jsonl',
    'data/deprecated.jsonl',
    'data/malicious.jsonl',
]


def setup_schema(client):
    """Create the "Package" collection if it does not exist yet (idempotent)."""
    if client.collections.exists("Package"):
        return
    # All four attributes of a package record are stored as plain text.
    field_names = ("name", "type", "status", "description")
    client.collections.create(
        "Package",
        properties=[
            Property(name=field, data_type=DataType.TEXT)
            for field in field_names
        ]
    )


def generate_vector_string(package):
vector_str = f"{package['name']}"
# add description
package_url = ""
if package["type"] == "pypi":
vector_str += " is a Python package available on PyPI"
package_url = f"https://trustypkg.dev/pypi/{package['name']}"
elif package["type"] == "npm":
vector_str += " is a JavaScript package available on NPM"
package_url = f"https://trustypkg.dev/npm/{package['name']}"
elif package["type"] == "go":
vector_str += " is a Go package. "
package_url = f"https://trustypkg.dev/go/{package['name']}"
elif package["type"] == "crates":
vector_str += " is a Rust package available on Crates. "
package_url = f"https://trustypkg.dev/crates/{package['name']}"
elif package["type"] == "java":
vector_str += " is a Java package. "
package_url = f"https://trustypkg.dev/java/{package['name']}"

# add extra status
if package["status"] == "archived":
vector_str += f". However, this package is found to be archived and no longer maintained. For additional information refer to {package_url}"
elif package["status"] == "deprecated":
vector_str += f". However, this package is found to be deprecated and no longer recommended for use. For additional information refer to {package_url}"
elif package["status"] == "malicious":
vector_str += f". However, this package is found to be malicious. For additional information refer to {package_url}"
return vector_str


def add_data(client):
    """Import package records from the JSONL files into the "Package" collection.

    Skips any record whose status and description are unchanged compared to
    what is already stored, so re-runs only embed new or modified packages.
    """
    collection = client.collections.get("Package")

    # read all the data from db, we will only add if there is no data, or is different
    existing_packages = list(collection.iterator())
    packages_dict = {}
    for package in existing_packages:
        # Existing objects are keyed by "<name>/<type>" for change detection.
        key = package.properties['name']+"/"+package.properties['type']
        value = {
            'status': package.properties['status'],
            'description': package.properties['description'],
        }
        packages_dict[key] = value

    for json_file in json_files:
        with open(json_file, 'r') as f:
            print("Adding data from", json_file)
            # Dynamic batching lets the client size inserts automatically.
            with collection.batch.dynamic() as batch:
                for line in f:
                    package = json.loads(line)

                    # now add the status column, derived from the source file
                    # name (e.g. data/archived.jsonl -> "archived")
                    if 'archived' in json_file:
                        package['status'] = 'archived'
                    elif 'deprecated' in json_file:
                        package['status'] = 'deprecated'
                    elif 'malicious' in json_file:
                        package['status'] = 'malicious'
                    else:
                        package['status'] = 'unknown'

                    # check for the existing package and only add if different
                    # NOTE(review): a changed package is added again rather than
                    # updated in place -- presumably duplicates are acceptable
                    # downstream; verify.
                    key = package['name']+"/"+package['type']
                    if key in packages_dict:
                        if packages_dict[key]['status'] == package['status'] and packages_dict[key]['description'] == package['description']:
                            print("Package already exists", key)
                            continue

                    # prepare the object for embedding
                    print("Generating data for", key)
                    vector_str = generate_vector_string(package)
                    vector = generate_embeddings(vector_str)

                    batch.add_object(properties=package, vector=vector)


def run_import():
    """Start an embedded Weaviate instance and import all package data.

    The embedded database persists under ./weaviate_data, which the CI
    workflow uploads as an artifact after this script completes.
    """
    client = weaviate.WeaviateClient(
        embedded_options=EmbeddedOptions(
            persistence_data_path="./weaviate_data",
            # NOTE(review): non-default gRPC port, presumably to avoid clashing
            # with a locally running Weaviate -- confirm.
            grpc_port=50052
        ),
    )
    with client:
        client.connect()
        print('is_ready:', client.is_ready())

        setup_schema(client)
        add_data(client)


if __name__ == '__main__':
    run_import()
Empty file added utils/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions utils/embedding_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch import Tensor
import os
import warnings

# The transformers library internally is creating this warning, but does not
# impact our app. Safe to ignore.
warnings.filterwarnings(action='ignore', category=ResourceWarning)


# We won't have competing threads in this example app
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Initialize tokenizer and model for GTE-base.
# NOTE: these load at import time, so importing this module incurs the model
# download/load cost (network on first run, then memory) before any call.
tokenizer = AutoTokenizer.from_pretrained('thenlper/gte-base')
model = AutoModel.from_pretrained('thenlper/gte-base')


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token embeddings over the unmasked (attended) positions.

    Masked positions contribute zero to the sum; the divisor is the number
    of unmasked tokens per sequence.
    """
    keep = attention_mask[..., None].bool()
    zeroed = torch.where(
        keep, last_hidden_states, torch.zeros_like(last_hidden_states))
    token_counts = attention_mask.sum(dim=1)[..., None]
    return zeroed.sum(dim=1) / token_counts


def generate_embeddings(text):
    """Return a GTE-base embedding for *text* as a plain list of floats.

    The input is truncated to the model's 512-token limit. The token
    embeddings are average-pooled and L2-normalized, so downstream cosine
    similarity reduces to a dot product.
    """
    # (The original span contained scraped PR-review text between the def
    # line and the body; removed to restore a valid function.)
    inputs = tokenizer(text, return_tensors='pt',
                       max_length=512, truncation=True)
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    attention_mask = inputs['attention_mask']
    embeddings = average_pool(outputs.last_hidden_state, attention_mask)

    # (Optionally) normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    return embeddings.numpy().tolist()[0]