|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import sqlite3 |
| 4 | +import subprocess |
| 5 | +from pathlib import Path |
| 6 | + |
| 7 | +import pandas |
| 8 | +from sqlalchemy import URL |
| 9 | +from typing_extensions import Optional |
| 10 | + |
| 11 | +from patchwork.common.tools.tool import Tool |
| 12 | + |
| 13 | + |
class In2CSVTool(Tool, tool_name="in2csv_tool", auto_register=False):
    """Tool wrapper around csvkit's ``in2csv``: converts tabular files found in
    a working directory to CSV and reports the newly created files."""

    def __init__(self, path: str):
        super().__init__()
        # Directory used as the working directory for in2csv; converted CSVs
        # are written here as well.
        self.path = path

    @property
    def json_schema(self) -> dict:
        """JSON schema describing this tool for the LLM tool-calling API."""
        return {
            "name": "in2csv_tool",
            "description": """\
Convert common tabular data formats to CSV.

optional arguments:
  --reset-dimensions    Ignore the sheet dimensions provided by the XLSX file.
  --encoding-xls ENCODING_XLS
                        Specify the encoding of the input XLS file.
  -y SNIFF_LIMIT, --snifflimit SNIFF_LIMIT
                        Limit CSV dialect sniffing to the specified number of
                        bytes. Specify "0" to disable sniffing entirely, or
                        "-1" to sniff the entire file.
  -I, --no-inference    Disable type inference (and --locale, --date-format,
                        --datetime-format, --no-leading-zeroes) when parsing
                        CSV input.
""",
            "input_schema": {
                "type": "object",
                "properties": {
                    "files": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "The CSV file(s) to operate on",
                    },
                    "args": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "The args to run with",
                    },
                },
                "required": ["files"],
            },
        }

    def execute(self, files: list[str], args: Optional[list[str]] = None) -> str:
        """Run ``in2csv`` on *files* (with optional extra *args*) inside
        ``self.path`` and list the CSV files that appeared as a result.

        Returns the in2csv stderr prefixed with ``ERROR:`` on failure.
        """
        extra_args = args or []

        workdir = Path(self.path)
        # Snapshot the CSV file names present before conversion so we can
        # report only the ones in2csv created.
        pre_existing = {entry.name for entry in workdir.iterdir() if entry.suffix == ".csv"}

        proc = subprocess.run(
            ["in2csv", *files, *extra_args, "--write-sheets", "-", "--use-sheet-names"],
            capture_output=True,
            text=True,
            cwd=self.path,
        )
        if proc.returncode != 0:
            return "ERROR:\n" + proc.stderr

        report_lines = ["Files converted to CSV:"]
        for entry in workdir.iterdir():
            if entry.suffix == ".csv" and entry.name not in pre_existing:
                report_lines.append(f"* {entry}")

        return "\n".join(report_lines)
| 79 | + |
| 80 | + |
class CSVSQLTool(Tool, tool_name="csvsql_tool", auto_register=False):
    """Tool that imports CSV files into a scratch SQLite database (via
    csvkit's ``csvsql``) and answers SQL queries over them with pandas."""

    def __init__(self, path: str, tmp_path: str):
        """
        Args:
            path: Directory containing the CSV files to operate on.
            tmp_path: Directory where the scratch SQLite database is kept.
        """
        super().__init__()
        self.path = path
        self.tmp_path = tmp_path

    @property
    def json_schema(self) -> dict:
        """JSON schema describing this tool for the LLM tool-calling API."""
        return {
            "name": "csvsql_tool",
            "description": """\
Execute SQL query directly on csv files. The name of the csv files can be referenced as table in the SQL query

If the output is larger than 5000 characters, the remaining characters are replaced with <TRUNCATED>.
""",
            "input_schema": {
                "type": "object",
                "properties": {
                    "files": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "The CSV file(s) to operate on",
                    },
                    "query": {
                        "type": "string",
                        "description": "SQL query to execute",
                    },
                },
                "required": ["files", "query"],
            },
        }

    def execute(self, files: list[str], query: str) -> str:
        """Import any not-yet-loaded CSV files into the scratch database, run
        *query* against it, and return the result as CSV text.

        Returns the csvsql stderr prefixed with ``ERROR:`` on failure; output
        longer than 5000 characters is cut and suffixed with ``<TRUNCATED>``.
        """
        db_path = (Path(self.tmp_path) / "tmp.db").resolve()
        # Build the URL through the documented `database` field: an absolute
        # path renders as "sqlite:////abs/path".  Routing the path through
        # `host` (the previous approach) yields "sqlite:///abs/path", which a
        # consumer parses as a path *relative to its cwd* — csvsql would then
        # write a different file than the one pandas reads below.
        db_url = URL.create(drivername="sqlite", database=str(db_path)).render_as_string()

        # Import only the files whose table is not already in the database.
        # Probe sqlite_master with a parameterized query: the previous
        # f-string `SELECT 1 FROM <table>` raised sqlite3.OperationalError
        # whenever the table was missing (instead of returning no row),
        # re-imported empty-but-existing tables, and interpolated the file
        # name directly into SQL.
        files_to_insert = list(files)
        if db_path.is_file():
            conn = sqlite3.connect(str(db_path))
            try:
                files_to_insert = [
                    file
                    for file in files
                    if conn.execute(
                        "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
                        (file.removesuffix(".csv"),),
                    ).fetchone()
                    is None
                ]
            finally:
                conn.close()

        if files_to_insert:
            p = subprocess.run(
                ["csvsql", *files_to_insert, "--db", db_url, "--insert"],
                capture_output=True,
                text=True,
                cwd=self.path,
            )
            if p.returncode != 0:
                return "ERROR:\n" + p.stderr

        # `with sqlite3.connect(...)` only manages the transaction, not the
        # connection handle, so close explicitly to avoid leaking it.
        conn = sqlite3.connect(str(db_path))
        try:
            rv = pandas.read_sql_query(query, conn).to_csv()
        finally:
            conn.close()

        if len(rv) > 5000:
            return rv[:5000] + "<TRUNCATED>"
        return rv
0 commit comments