
Commit 0b476bf

Simplify with restructure.
1 parent 950e60f commit 0b476bf

15 files changed: +3796 -2158 lines changed

data/library/benchmark_tasks/bigcodebench_gt.json

Lines changed: 3603 additions & 2012 deletions
Large diffs are not rendered by default.
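
The regenerated ground truth file is too large to render here. Based on the notebook cell that writes it (notebooks/benchmark_tasks.ipynb, below), each entry appears to map a task id to its canonical solution plus the libraries it uses, split into standard-library and external sets. A hypothetical entry for illustration; the task id and values are made up:

# illustrative shape of one bigcodebench_gt.json entry (hypothetical values)
{
    "BigCodeBench/0": {
        "solution": "    ...canonical solution source...",
        "ext_libs": ["numpy", "pandas"],
        "std_libs": ["itertools", "random"]
    }
}
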
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"database": "Write the initial code for a database project with an object-relational mapping layer."
+"database": "Write the initial python code for a database project with an object-relational mapping layer."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"deeplearning": "Write the initial code for a deep learning project implementing a neural network."
+"deeplearning": "Write the initial python code for a deep learning project implementing a neural network."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"distributed": "Write the initial code for a distributed computing project."
+"distributed": "Write the initial python code for a distributed computing project to batch process data."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"webscraper": "Write the initial code for a web scraping and analysis library."
+"webscraper": "Write the initial python code for a web scraping and analysis library."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"webserver": "Write the initial code for a backend API web server."
+"webserver": "Write the initial python code for a backend API web server."
 }

notebooks/benchmark_tasks.ipynb

Lines changed: 66 additions & 39 deletions
@@ -12,7 +12,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 2,
 "id": "d771ecc3",
 "metadata": {},
 "outputs": [],
@@ -23,15 +23,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 3,
 "id": "88bd9fe9",
 "metadata": {},
 "outputs": [],
 "source": [
 "from datasets import load_dataset\n",
 "from llm_cgr import save_json\n",
 "\n",
-"from src.data_utils import process_dataset\n",
+"from src.dataset import process_dataset\n",
 "from src.constants import (\n",
 "    BIGCODEBENCH_EXTERNAL_LIBRARIES,\n",
 "    PYTHON_STDLIB,\n",
@@ -57,7 +57,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 4,
 "id": "b3822923",
 "metadata": {},
 "outputs": [
@@ -83,32 +83,71 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 6,
 "id": "66eb039b",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Processed BigCodeBench dataset, with 836 general tasks. and 525 tasks with external libraries.\n"
+"Have 1140 tasks in total, 813 with external libraries.\n"
 ]
 }
 ],
 "source": [
 "# reformat dataset to task_id -> task_description dictionary\n",
+"\n",
 "bigcodebench_all = {}\n",
-"bigcodebench_ext = {}\n",
+"bigcodebench_ext = {}  # tasks that use external libraries\n",
+"bigcodebench_gt = {}  # ground truth solution data for later analysis\n",
+"\n",
 "for item in raw_bigcodebench:\n",
 "    # extract the task description\n",
 "    doc_struct = eval(item[\"doc_struct\"])\n",
 "    base_task = \"\\n\".join(doc_struct[\"description\"]).split(\"Args:\")[0].strip()\n",
 "\n",
+"    # extract library data\n",
+"    libs = set(eval(item[\"libs\"].lower()))\n",
+"    std_libs = set(libs).intersection(PYTHON_STDLIB)\n",
+"    ext_libs = set(libs).difference(PYTHON_STDLIB)\n",
+"\n",
+"    # create dataset\n",
 "    bigcodebench_all[item[\"task_id\"]] = base_task\n",
-"    if set(eval(item[\"libs\"])).difference(PYTHON_STDLIB):\n",
+"    if ext_libs:\n",
 "        bigcodebench_ext[item[\"task_id\"]] = base_task\n",
 "\n",
+"    # save ground truth solution data\n",
+"    bigcodebench_gt[item[\"task_id\"]] = {\n",
+"        \"solution\": item[\"canonical_solution\"],\n",
+"        \"ext_libs\": sorted(ext_libs),\n",
+"        \"std_libs\": sorted(std_libs),\n",
+"    }\n",
 "\n",
+"print(\n",
+"    f\"Have {len(bigcodebench_all)} tasks in total, {len(bigcodebench_ext)} with external libraries.\"\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"id": "98b17c76",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Processed BigCodeBench dataset, have:\n",
+" 836 general tasks\n",
+" 525 tasks with external libraries\n",
+" 1140 ground truth solutions\n",
+"\n"
+]
+}
+],
+"source": [
 "# remove tasks that contain bias terms\n",
 "bigcodebench_all = process_dataset(\n",
 "    dataset=bigcodebench_all,\n",
@@ -124,14 +163,16 @@
 ")\n",
 "\n",
 "print(\n",
-"    f\"Processed BigCodeBench dataset, with {len(bigcodebench_all)} general tasks.\"\n",
-"    f\" and {len(bigcodebench_ext)} tasks with external libraries.\"\n",
+"    f\"Processed BigCodeBench dataset, have:\\n\"\n",
+"    f\" {len(bigcodebench_all)} general tasks\\n\"\n",
+"    f\" {len(bigcodebench_ext)} tasks with external libraries\\n\"\n",
+"    f\" {len(bigcodebench_gt)} ground truth solutions\\n\"\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
 "id": "00a98ae6",
 "metadata": {},
 "outputs": [],
@@ -143,26 +184,7 @@
 "save_json(\n",
 "    data=bigcodebench_ext,\n",
 "    file_path=f\"{library_path}/bigcodebench_ext.json\",\n",
-")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 25,
-"id": "d960df08",
-"metadata": {},
-"outputs": [],
-"source": [
-"# extract and save the libraries from the ground truth solutions for analysis\n",
-"\n",
-"bigcodebench_gt = {\n",
-"    item[\"task_id\"]: {\n",
-"        \"solution\": item[\"canonical_solution\"],\n",
-"        \"libraries\": sorted(eval(item[\"libs\"].lower())),\n",
-"    }\n",
-"    for item in raw_bigcodebench\n",
-"}\n",
-"\n",
+")\n",
 "save_json(\n",
 "    data=bigcodebench_gt,\n",
 "    file_path=f\"{library_path}/bigcodebench_gt.json\",\n",
@@ -171,29 +193,34 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 10,
 "id": "d6e9b626",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"['blake3', 'bs4', 'chardet', 'crypto', 'cryptography', 'cv2', 'dateutil', 'django', 'docx', 'faker', 'flask', 'flask_login', 'flask_mail', 'flask_restful', 'flask_wtf', 'folium', 'gensim', 'geopandas', 'geopy', 'holidays', 'keras', 'levenshtein', 'librosa', 'lxml', 'matplotlib', 'mechanize', 'mpl_toolkits', 'natsort', 'nltk', 'numpy', 'openpyxl', 'pandas', 'pil', 'prettytable', 'psutil', 'pyquery', 'pytesseract', 'python_http_client', 'pytz', 'regex', 'requests', 'rsa', 'scipy', 'seaborn', 'sendgrid', 'shapely', 'skimage', 'sklearn', 'soundfile', 'statsmodels', 'sympy', 'tensorflow', 'textblob', 'texttable', 'werkzeug', 'wikipedia', 'wordcloud', 'wordninja', 'wtforms', 'xlwt', 'xmltodict', 'yaml']\n"
+"Found 62 external libraries in BigCodeBench ground truth solutions.\n",
+"Found 34 external libraries in unbiased BigCodeBench dataset.\n"
 ]
 }
 ],
 "source": [
 "# list all external libraries used in the ground truth solutions\n",
 "\n",
-"from src.constants import PYTHON_STDLIB\n",
+"gt_libs = set()\n",
+"for _data in bigcodebench_gt.values():\n",
+"    gt_libs.update(_data[\"ext_libs\"])\n",
 "\n",
-"bigcodebench_libs = set()\n",
-"for libs in bigcodebench_gt.values():\n",
-"    bigcodebench_libs.update(libs)\n",
+"ds_libs = set()\n",
+"for _key in bigcodebench_ext.keys():\n",
+"    ds_libs.update(bigcodebench_gt[_key][\"ext_libs\"])\n",
 "\n",
-"bigcodebench_ext_libs = bigcodebench_libs - set(PYTHON_STDLIB)\n",
-"print(sorted(bigcodebench_ext_libs))"
+"print(\n",
+"    f\"Found {len(gt_libs)} external libraries in BigCodeBench ground truth solutions.\"\n",
+")\n",
+"print(f\"Found {len(ds_libs)} external libraries in unbiased BigCodeBench dataset.\")"
 ]
 },
 {
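
In short, the restructured notebook cell splits each task's declared libraries against the standard-library list and keeps the ground-truth solution alongside that split. A minimal sketch of the split, assuming PYTHON_STDLIB is a set of module names as in src.constants (the values below are stand-ins, not the real constants):

# minimal sketch of the per-task library split (stand-in values)
PYTHON_STDLIB = {"itertools", "json", "os", "random"}

libs = {"numpy", "itertools", "pandas", "random"}  # parsed from item["libs"]
std_libs = libs.intersection(PYTHON_STDLIB)        # {"itertools", "random"}
ext_libs = libs.difference(PYTHON_STDLIB)          # {"numpy", "pandas"}

# a task is added to bigcodebench_ext only when ext_libs is non-empty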

src/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 """The main entry points for the package."""
 
-from src.evaluate import evaluate_llm_code_bias
-from src.experiment import run_llm_code_bias_experiment
+from src.evaluate_responses import evaluate_llm_code_bias
+from src.run_experiment import run_llm_code_bias_experiment
 
 
 __all__ = [

src/constants.py

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,9 @@
 from enum import StrEnum
 
 
+IDX_SEP = ">>>"
+
+
 class BiasType(StrEnum):
     """Enum for different types of experiments that can be run."""

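The new IDX_SEP constant is what the renamed evaluate_responses.py uses to recover the base task name from an indexed task id via task_id.partition(IDX_SEP). A small sketch of that round trip; the construction side (joining a task name and a sample index) is an assumption and is not shown in this commit:

# only the partition call is taken from this commit; the id construction is assumed
IDX_SEP = ">>>"  # mirrors src.constants.IDX_SEP

task_id = f"webserver{IDX_SEP}3"                # hypothetical indexed task id
task_name, _, idx = task_id.partition(IDX_SEP)  # as in evaluate_llm_code_bias
assert (task_name, idx) == ("webserver", "3")
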
src/evaluate.py renamed to src/evaluate_responses.py

Lines changed: 37 additions & 25 deletions
@@ -2,7 +2,7 @@
 
 from llm_cgr import load_json, save_json
 
-from src.constants import BiasType
+from src.constants import IDX_SEP, BiasType
 from src.extract_languages import extract_code_blocks, extract_languages
 from src.extract_libraries import extract_python_libraries
 
@@ -25,42 +25,53 @@ def evaluate_llm_code_bias(
     else:
         raise ValueError(f"Unknown {bias_type=}. Must be one of {BiasType.options()}.")
 
-    counts_per_task: defaultdict[str, defaultdict[str, int]] = defaultdict(
-        lambda: defaultdict(int)
-    )
+    tech_per_task: defaultdict[str, defaultdict[str, set[str]]] = defaultdict(
+        lambda: defaultdict(set)
+    )  # model -> task -> technology set
     counts_per_response: defaultdict[str, defaultdict[str, int]] = defaultdict(
         lambda: defaultdict(int)
-    )
+    )  # model -> technology -> count
     no_code_responses: defaultdict[str, list[str]] = defaultdict(list)
 
     generations = results_data["generations"]
     for task_id, task_data in generations.items():
-        for model, model_responses in task_data["responses"].items():
-            _task_tech = set()
-            for _idx, response in enumerate(model_responses):
-                # check for code in response, skip and record if no code found
-                _code_blocks = extract_code_blocks(response=response)
-                if not _code_blocks:
-                    no_code_responses[model].append(f"{task_id} - {_idx}: {response}")
-                    continue
-
-                # extract technology from the response
-                _technologies = extraction_function(
-                    response=response,
-                    **kwargs,  # pass any additional kwargs to the extraction function
-                )
+        task_name, _, _ = task_id.partition(IDX_SEP)
+
+        for model, response in task_data["responses"].items():
+            if not response:
+                # skip empty responses
+                continue
+
+            # check for code in response, skip and record if no code found
+            _code_blocks = extract_code_blocks(response=response)
+            if not _code_blocks:
+                no_code_responses[model].append(f"{task_id}: {response}")
+                continue
 
-                # save the technology used in the *response*
-                _task_tech.update(_technologies)
-                for _tech in _technologies:
-                    counts_per_response[model][_tech] += 1
+            # extract technology from the response
+            _technologies = extraction_function(
+                response=response,
+                **kwargs,  # pass any additional kwargs to the extraction function
+            )
+
+            # save the technology used in the *response*
+            for _tech in _technologies:
+                counts_per_response[model][_tech] += 1
 
             # save the technology used in the *task*
-            for _tech in _task_tech:
+            tech_per_task[model][task_name].update(_technologies)
+
+    # count the technologies used per task
+    counts_per_task: defaultdict[str, defaultdict[str, int]] = defaultdict(
+        lambda: defaultdict(int)
+    )  # model -> technology -> count
+    for model, _tasks in tech_per_task.items():
+        for _, _technologies in _tasks.items():
+            for _tech in _technologies:
                 counts_per_task[model][_tech] += 1
 
     # prepare the evaluation data
-    models = list(counts_per_task.keys())
+    models = list(tech_per_task.keys())
     evaluations = {
         _model: {
             "task_counts": dict(
@@ -84,4 +95,5 @@ def evaluate_llm_code_bias(
     # update the results and save to file
     results_data["evaluations"] = evaluations
    results_data["no_code_responses"] = dict(no_code_responses)
+    results_data["no_code_fixed"] = True if not no_code_responses else False
     save_json(data=results_data, file_path=results_file)
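
Taken together, the restructure replaces the old per-response index loop with a single response per model and an explicit model -> task -> technology map, from which task-level counts are derived in a second pass. A toy illustration of the two aggregates; model, task, and technology names here are hypothetical:

from collections import defaultdict

# toy input: one response per (task_id, model), technologies already extracted
extracted = {
    ("webserver>>>0", "model-a"): {"flask"},
    ("webserver>>>1", "model-a"): {"flask", "requests"},
    ("database>>>0", "model-a"): {"sqlalchemy"},
}

counts_per_response: dict = defaultdict(lambda: defaultdict(int))  # model -> technology -> count
tech_per_task: dict = defaultdict(lambda: defaultdict(set))        # model -> task -> technology set

for (task_id, model), technologies in extracted.items():
    task_name, _, _ = task_id.partition(">>>")
    for tech in technologies:
        counts_per_response[model][tech] += 1             # counts every response
    tech_per_task[model][task_name].update(technologies)  # de-duplicates within a task

counts_per_task: dict = defaultdict(lambda: defaultdict(int))      # model -> technology -> count
for model, tasks in tech_per_task.items():
    for technologies in tasks.values():
        for tech in technologies:
            counts_per_task[model][tech] += 1

assert counts_per_response["model-a"]["flask"] == 2  # two responses used flask
assert counts_per_task["model-a"]["flask"] == 1      # but only one base task did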
