diff --git a/.github/workflows/import_packages.yml b/.github/workflows/import_packages.yml
index 3da31b63..e7ada4d4 100644
--- a/.github/workflows/import_packages.yml
+++ b/.github/workflows/import_packages.yml
@@ -47,6 +47,7 @@ jobs:
           MALICIOUS_KEY=$(jq -r '.latest.malicious_packages' manifest.json)
           DEPRECATED_KEY=$(jq -r '.latest.deprecated_packages' manifest.json)
           ARCHIVED_KEY=$(jq -r '.latest.archived_packages' manifest.json)
+          VULNERABLE_KEY=$(jq -r '.latest.vulnerable_packages' manifest.json)
 
           echo "Malicious key: $MALICIOUS_KEY"
           echo "Deprecated key: $DEPRECATED_KEY"
@@ -58,6 +59,7 @@ jobs:
           aws s3 cp s3://codegate-data-prod/$MALICIOUS_KEY /tmp/jsonl-files/malicious.jsonl --region $AWS_REGION
           aws s3 cp s3://codegate-data-prod/$DEPRECATED_KEY /tmp/jsonl-files/deprecated.jsonl --region $AWS_REGION
           aws s3 cp s3://codegate-data-prod/$ARCHIVED_KEY /tmp/jsonl-files/archived.jsonl --region $AWS_REGION
+          aws s3 cp s3://codegate-data-prod/$VULNERABLE_KEY /tmp/jsonl-files/vulnerable.jsonl --region $AWS_REGION
 
       - name: Install Poetry
         run: |
diff --git a/scripts/import_packages.py b/scripts/import_packages.py
index 1cfdfd1e..c4a2dad1 100644
--- a/scripts/import_packages.py
+++ b/scripts/import_packages.py
@@ -20,6 +20,7 @@ def __init__(self, jsonl_dir="data", vec_db_path="./sqlite_data/vectordb.db"):
             os.path.join(jsonl_dir, "archived.jsonl"),
             os.path.join(jsonl_dir, "deprecated.jsonl"),
             os.path.join(jsonl_dir, "malicious.jsonl"),
+            os.path.join(jsonl_dir, "vulnerable.jsonl"),
         ]
         self.conn = self._get_connection()
         Config.load()  # Load the configuration
@@ -48,13 +49,50 @@ def setup_schema(self):
         """
         )
 
+        # Table for packages that have at least one high or critical vulnerability.
+        # IF NOT EXISTS keeps setup_schema idempotent when the database already
+        # exists; the UNIQUE constraint lets re-imports skip rows already stored.
+        cursor.execute(
+            """
+            CREATE TABLE IF NOT EXISTS cve_packages (
+                name TEXT NOT NULL,
+                version TEXT NOT NULL,
+                type TEXT NOT NULL,
+                UNIQUE (name, version, type)
+            )
+        """
+        )
+
         # Create indexes for faster querying
         cursor.execute("CREATE INDEX IF NOT EXISTS idx_name ON packages(name)")
         cursor.execute("CREATE INDEX IF NOT EXISTS idx_type ON packages(type)")
         cursor.execute("CREATE INDEX IF NOT EXISTS idx_status ON packages(status)")
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_pkg_cve_name ON cve_packages(name)")
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_pkg_cve_type ON cve_packages(type)")
+        cursor.execute("CREATE INDEX IF NOT EXISTS idx_pkg_cve_version ON cve_packages(version)")
 
         self.conn.commit()
 
+    async def process_cve_packages(self, package):
+        """Store one vulnerable package record in the cve_packages table.
+
+        Reads the "name", "version" and "type" keys of ``package``. A row that
+        violates the table's UNIQUE constraint is skipped, so importing the
+        same record twice does not create duplicates.
+        """
+        cursor = self.conn.cursor()
+        cursor.execute(
+            """
+            INSERT OR IGNORE INTO cve_packages (name, version, type) VALUES (?, ?, ?)
+        """,
+            (
+                package["name"],
+                package["version"],
+                package["type"],
+            ),
+        )
+        self.conn.commit()
+
     async def process_package(self, package):
         vector_str = generate_vector_string(package)
         vector = await self.inference_engine.embed(
@@ -101,6 +139,11 @@ async def add_data(self):
                     package["status"] = json_file.split("/")[-1].split(".")[0]
                     key = f"{package['name']}/{package['type']}"
 
+                    if package["status"] == "vulnerable":
+                        # Vulnerable packages use the CVE flow and skip the duplicate check below
+                        await self.process_cve_packages(package)
+                        continue
+
                     if key in existing_packages and existing_packages[key] == {
                         "status": package["status"],
                         "description": package["description"],