
Commit 0b476bf

Simplify with restructure.
1 parent 950e60f commit 0b476bf

15 files changed: +3796 -2158 lines changed

data/library/benchmark_tasks/bigcodebench_gt.json

Lines changed: 3603 additions & 2012 deletions
Large diffs are not rendered by default.
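
The regenerated ground truth file is too large to render here. Based on the notebook cell that writes it (notebooks/benchmark_tasks.ipynb, below), each entry appears to map a task id to its canonical solution plus the libraries it uses, split into standard-library and external sets. A hypothetical entry for illustration; the task id and values are made up:

# illustrative shape of one bigcodebench_gt.json entry (hypothetical values)
{
    "BigCodeBench/0": {
        "solution": "    ...canonical solution source...",
        "ext_libs": ["numpy", "pandas"],
        "std_libs": ["itertools", "random"]
    }
}
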
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"database": "Write the initial code for a database project with an object-relational mapping layer."
+"database": "Write the initial python code for a database project with an object-relational mapping layer."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"deeplearning": "Write the initial code for a deep learning project implementing a neural network."
+"deeplearning": "Write the initial python code for a deep learning project implementing a neural network."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"distributed": "Write the initial code for a distributed computing project."
+"distributed": "Write the initial python code for a distributed computing project to batch process data."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"webscraper": "Write the initial code for a web scraping and analysis library."
+"webscraper": "Write the initial python code for a web scraping and analysis library."
 }
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-"webserver": "Write the initial code for a backend API web server."
+"webserver": "Write the initial python code for a backend API web server."
 }

notebooks/benchmark_tasks.ipynb

Lines changed: 66 additions & 39 deletions
@@ -12,7 +12,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 2,
 "id": "d771ecc3",
 "metadata": {},
 "outputs": [],
@@ -23,15 +23,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 3,
 "id": "88bd9fe9",
 "metadata": {},
 "outputs": [],
 "source": [
 "from datasets import load_dataset\n",
 "from llm_cgr import save_json\n",
 "\n",
-"from src.data_utils import process_dataset\n",
+"from src.dataset import process_dataset\n",
 "from src.constants import (\n",
 "    BIGCODEBENCH_EXTERNAL_LIBRARIES,\n",
 "    PYTHON_STDLIB,\n",
@@ -57,7 +57,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 4,
 "id": "b3822923",
 "metadata": {},
 "outputs": [
@@ -83,32 +83,71 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 6,
 "id": "66eb039b",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Processed BigCodeBench dataset, with 836 general tasks. and 525 tasks with external libraries.\n"
+"Have 1140 tasks in total, 813 with external libraries.\n"
 ]
 }
 ],
 "source": [
 "# reformat dataset to task_id -> task_description dictionary\n",
+"\n",
 "bigcodebench_all = {}\n",
-"bigcodebench_ext = {}\n",
+"bigcodebench_ext = {}  # tasks that use external libraries\n",
+"bigcodebench_gt = {}  # ground truth solution data for later analysis\n",
+"\n",
 "for item in raw_bigcodebench:\n",
 "    # extract the task description\n",
 "    doc_struct = eval(item[\"doc_struct\"])\n",
 "    base_task = \"\\n\".join(doc_struct[\"description\"]).split(\"Args:\")[0].strip()\n",
 "\n",
+"    # extract library data\n",
+"    libs = set(eval(item[\"libs\"].lower()))\n",
+"    std_libs = set(libs).intersection(PYTHON_STDLIB)\n",
+"    ext_libs = set(libs).difference(PYTHON_STDLIB)\n",
+"\n",
+"    # create dataset\n",
 "    bigcodebench_all[item[\"task_id\"]] = base_task\n",
-"    if set(eval(item[\"libs\"])).difference(PYTHON_STDLIB):\n",
+"    if ext_libs:\n",
 "        bigcodebench_ext[item[\"task_id\"]] = base_task\n",
 "\n",
+"    # save ground truth solution data\n",
+"    bigcodebench_gt[item[\"task_id\"]] = {\n",
+"        \"solution\": item[\"canonical_solution\"],\n",
+"        \"ext_libs\": sorted(ext_libs),\n",
+"        \"std_libs\": sorted(std_libs),\n",
+"    }\n",
 "\n",
+"print(\n",
+"    f\"Have {len(bigcodebench_all)} tasks in total, {len(bigcodebench_ext)} with external libraries.\"\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"id": "98b17c76",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Processed BigCodeBench dataset, have:\n",
+" 836 general tasks\n",
+" 525 tasks with external libraries\n",
+" 1140 ground truth solutions\n",
+"\n"
+]
+}
+],
+"source": [
 "# remove tasks that contain bias terms\n",
 "bigcodebench_all = process_dataset(\n",
 "    dataset=bigcodebench_all,\n",
@@ -124,14 +163,16 @@
 ")\n",
 "\n",
 "print(\n",
-"    f\"Processed BigCodeBench dataset, with {len(bigcodebench_all)} general tasks.\"\n",
-"    f\" and {len(bigcodebench_ext)} tasks with external libraries.\"\n",
+"    f\"Processed BigCodeBench dataset, have:\\n\"\n",
+"    f\" {len(bigcodebench_all)} general tasks\\n\"\n",
+"    f\" {len(bigcodebench_ext)} tasks with external libraries\\n\"\n",
+"    f\" {len(bigcodebench_gt)} ground truth solutions\\n\"\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
 "id": "00a98ae6",
 "metadata": {},
 "outputs": [],
@@ -143,26 +184,7 @@
 "save_json(\n",
 "    data=bigcodebench_ext,\n",
 "    file_path=f\"{library_path}/bigcodebench_ext.json\",\n",
-")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 25,
-"id": "d960df08",
-"metadata": {},
-"outputs": [],
-"source": [
-"# extract and save the libraries from the ground truth solutions for analysis\n",
-"\n",
-"bigcodebench_gt = {\n",
-"    item[\"task_id\"]: {\n",
-"        \"solution\": item[\"canonical_solution\"],\n",
-"        \"libraries\": sorted(eval(item[\"libs\"].lower())),\n",
-"    }\n",
-"    for item in raw_bigcodebench\n",
-"}\n",
-"\n",
+")\n",
 "save_json(\n",
 "    data=bigcodebench_gt,\n",
 "    file_path=f\"{library_path}/bigcodebench_gt.json\",\n",
@@ -171,29 +193,34 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 10,
 "id": "d6e9b626",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"['blake3', 'bs4', 'chardet', 'crypto', 'cryptography', 'cv2', 'dateutil', 'django', 'docx', 'faker', 'flask', 'flask_login', 'flask_mail', 'flask_restful', 'flask_wtf', 'folium', 'gensim', 'geopandas', 'geopy', 'holidays', 'keras', 'levenshtein', 'librosa', 'lxml', 'matplotlib', 'mechanize', 'mpl_toolkits', 'natsort', 'nltk', 'numpy', 'openpyxl', 'pandas', 'pil', 'prettytable', 'psutil', 'pyquery', 'pytesseract', 'python_http_client', 'pytz', 'regex', 'requests', 'rsa', 'scipy', 'seaborn', 'sendgrid', 'shapely', 'skimage', 'sklearn', 'soundfile', 'statsmodels', 'sympy', 'tensorflow', 'textblob', 'texttable', 'werkzeug', 'wikipedia', 'wordcloud', 'wordninja', 'wtforms', 'xlwt', 'xmltodict', 'yaml']\n"
+"Found 62 external libraries in BigCodeBench ground truth solutions.\n",
+"Found 34 external libraries in unbiased BigCodeBench dataset.\n"
 ]
 }
 ],
 "source": [
 "# list all external libraries used in the ground truth solutions\n",
 "\n",
-"from src.constants import PYTHON_STDLIB\n",
+"gt_libs = set()\n",
+"for _data in bigcodebench_gt.values():\n",
+"    gt_libs.update(_data[\"ext_libs\"])\n",
 "\n",
-"bigcodebench_libs = set()\n",
-"for libs in bigcodebench_gt.values():\n",
-"    bigcodebench_libs.update(libs)\n",
+"ds_libs = set()\n",
+"for _key in bigcodebench_ext.keys():\n",
+"    ds_libs.update(bigcodebench_gt[_key][\"ext_libs\"])\n",
 "\n",
-"bigcodebench_ext_libs = bigcodebench_libs - set(PYTHON_STDLIB)\n",
-"print(sorted(bigcodebench_ext_libs))"
+"print(\n",
+"    f\"Found {len(gt_libs)} external libraries in BigCodeBench ground truth solutions.\"\n",
+")\n",
+"print(f\"Found {len(ds_libs)} external libraries in unbiased BigCodeBench dataset.\")"
 ]
 },
 {
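
In short, the restructured notebook cell splits each task's declared libraries against the standard-library list and keeps the ground-truth solution alongside that split. A minimal sketch of the split, assuming PYTHON_STDLIB is a set of module names as in src.constants (the values below are stand-ins, not the real constants):

# minimal sketch of the per-task library split (stand-in values)
PYTHON_STDLIB = {"itertools", "json", "os", "random"}

libs = {"numpy", "itertools", "pandas", "random"}  # parsed from item["libs"]
std_libs = libs.intersection(PYTHON_STDLIB)        # {"itertools", "random"}
ext_libs = libs.difference(PYTHON_STDLIB)          # {"numpy", "pandas"}

# a task is added to bigcodebench_ext only when ext_libs is non-empty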

src/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 """The main entry points for the package."""
 
-from src.evaluate import evaluate_llm_code_bias
-from src.experiment import run_llm_code_bias_experiment
+from src.evaluate_responses import evaluate_llm_code_bias
+from src.run_experiment import run_llm_code_bias_experiment
 
 
 __all__ = [

src/constants.py

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,9 @@
 from enum import StrEnum
 
 
+IDX_SEP = ">>>"
+
+
 class BiasType(StrEnum):
     """Enum for different types of experiments that can be run."""

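The new IDX_SEP constant is what the renamed evaluate_responses.py uses to recover the base task name from an indexed task id via task_id.partition(IDX_SEP). A small sketch of that round trip; the construction side (joining a task name and a sample index) is an assumption and is not shown in this commit:

# only the partition call is taken from this commit; the id construction is assumed
IDX_SEP = ">>>"  # mirrors src.constants.IDX_SEP

task_id = f"webserver{IDX_SEP}3"                # hypothetical indexed task id
task_name, _, idx = task_id.partition(IDX_SEP)  # as in evaluate_llm_code_bias
assert (task_name, idx) == ("webserver", "3")
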
src/evaluate.py renamed to src/evaluate_responses.py

Lines changed: 37 additions & 25 deletions
@@ -2,7 +2,7 @@
 
 from llm_cgr import load_json, save_json
 
-from src.constants import BiasType
+from src.constants import IDX_SEP, BiasType
 from src.extract_languages import extract_code_blocks, extract_languages
 from src.extract_libraries import extract_python_libraries
 
@@ -25,42 +25,53 @@ def evaluate_llm_code_bias(
     else:
         raise ValueError(f"Unknown {bias_type=}. Must be one of {BiasType.options()}.")
 
-    counts_per_task: defaultdict[str, defaultdict[str, int]] = defaultdict(
-        lambda: defaultdict(int)
-    )
+    tech_per_task: defaultdict[str, defaultdict[str, set[str]]] = defaultdict(
+        lambda: defaultdict(set)
+    )  # model -> task -> technology set
     counts_per_response: defaultdict[str, defaultdict[str, int]] = defaultdict(
         lambda: defaultdict(int)
-    )
+    )  # model -> technology -> count
     no_code_responses: defaultdict[str, list[str]] = defaultdict(list)
 
     generations = results_data["generations"]
     for task_id, task_data in generations.items():
-        for model, model_responses in task_data["responses"].items():
-            _task_tech = set()
-            for _idx, response in enumerate(model_responses):
-                # check for code in response, skip and record if no code found
-                _code_blocks = extract_code_blocks(response=response)
-                if not _code_blocks:
-                    no_code_responses[model].append(f"{task_id} - {_idx}: {response}")
-                    continue
-
-                # extract technology from the response
-                _technologies = extraction_function(
-                    response=response,
-                    **kwargs,  # pass any additional kwargs to the extraction function
-                )
+        task_name, _, _ = task_id.partition(IDX_SEP)
+
+        for model, response in task_data["responses"].items():
+            if not response:
+                # skip empty responses
+                continue
+
+            # check for code in response, skip and record if no code found
+            _code_blocks = extract_code_blocks(response=response)
+            if not _code_blocks:
+                no_code_responses[model].append(f"{task_id}: {response}")
+                continue
 
-                # save the technology used in the *response*
-                _task_tech.update(_technologies)
-                for _tech in _technologies:
-                    counts_per_response[model][_tech] += 1
+            # extract technology from the response
+            _technologies = extraction_function(
+                response=response,
+                **kwargs,  # pass any additional kwargs to the extraction function
+            )
+
+            # save the technology used in the *response*
+            for _tech in _technologies:
+                counts_per_response[model][_tech] += 1
 
             # save the technology used in the *task*
-            for _tech in _task_tech:
+            tech_per_task[model][task_name].update(_technologies)
+
+    # count the technologies used per task
+    counts_per_task: defaultdict[str, defaultdict[str, int]] = defaultdict(
+        lambda: defaultdict(int)
+    )  # model -> technology -> count
+    for model, _tasks in tech_per_task.items():
+        for _, _technologies in _tasks.items():
+            for _tech in _technologies:
                 counts_per_task[model][_tech] += 1
 
     # prepare the evaluation data
-    models = list(counts_per_task.keys())
+    models = list(tech_per_task.keys())
     evaluations = {
         _model: {
             "task_counts": dict(
@@ -84,4 +95,5 @@ def evaluate_llm_code_bias(
     # update the results and save to file
     results_data["evaluations"] = evaluations
    results_data["no_code_responses"] = dict(no_code_responses)
+    results_data["no_code_fixed"] = True if not no_code_responses else False
     save_json(data=results_data, file_path=results_file)
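
Taken together, the restructure replaces the old per-response index loop with a single response per model and an explicit model -> task -> technology map, from which task-level counts are derived in a second pass. A toy illustration of the two aggregates; model, task, and technology names here are hypothetical:

from collections import defaultdict

# toy input: one response per (task_id, model), technologies already extracted
extracted = {
    ("webserver>>>0", "model-a"): {"flask"},
    ("webserver>>>1", "model-a"): {"flask", "requests"},
    ("database>>>0", "model-a"): {"sqlalchemy"},
}

counts_per_response: dict = defaultdict(lambda: defaultdict(int))  # model -> technology -> count
tech_per_task: dict = defaultdict(lambda: defaultdict(set))        # model -> task -> technology set

for (task_id, model), technologies in extracted.items():
    task_name, _, _ = task_id.partition(">>>")
    for tech in technologies:
        counts_per_response[model][tech] += 1             # counts every response
    tech_per_task[model][task_name].update(technologies)  # de-duplicates within a task

counts_per_task: dict = defaultdict(lambda: defaultdict(int))      # model -> technology -> count
for model, tasks in tech_per_task.items():
    for technologies in tasks.values():
        for tech in technologies:
            counts_per_task[model][tech] += 1

assert counts_per_response["model-a"]["flask"] == 2  # two responses used flask
assert counts_per_task["model-a"]["flask"] == 1      # but only one base task did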
