diff --git a/mfr/extensions/tabular/libs/__init__.py b/mfr/extensions/tabular/libs/__init__.py
index 609dc7a35..14e9cb375 100644
--- a/mfr/extensions/tabular/libs/__init__.py
+++ b/mfr/extensions/tabular/libs/__init__.py
@@ -8,6 +8,11 @@ def csv_stdlib():
     return csv_stdlib
 
 
+def tsv_stdlib():
+    from ..libs.stdlib_tools import tsv_stdlib
+    return tsv_stdlib
+
+
 def csv_pandas():
     from ..libs.panda_tools import csv_pandas
     return csv_pandas
diff --git a/mfr/extensions/tabular/libs/stdlib_tools.py b/mfr/extensions/tabular/libs/stdlib_tools.py
index 542d5744e..5c0856015 100644
--- a/mfr/extensions/tabular/libs/stdlib_tools.py
+++ b/mfr/extensions/tabular/libs/stdlib_tools.py
@@ -1,57 +1,96 @@
 import re
 import csv
+from http import HTTPStatus
 
-from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
 from mfr.extensions.tabular import utilities
+from mfr.extensions.tabular.exceptions import (EmptyTableError,
+                                               TabularRendererError)
 
 
 def csv_stdlib(fp):
     """Read and convert a csv file to JSON format using the python standard library
     :param fp: File pointer object
-    :return: tuple of table headers and data
+    :return: dict mapping the sheet name to a tuple of table headers and data
     """
-    data = fp.read(2048)
-    fp.seek(0)
-
-    try:
-        dialect = csv.Sniffer().sniff(data)
-    except csv.Error:
-        dialect = csv.excel
-    else:
-        _set_dialect_quote_attrs(dialect, data)
-
-    reader = csv.DictReader(fp, dialect=dialect)
+    return parse_stdlib(_build_reader(fp, csv.excel()))
+
+
+def tsv_stdlib(fp):
+    """Read and convert a tsv file to JSON format using the python standard library
+    :param fp: File pointer object
+    :return: dict mapping the sheet name to a tuple of table headers and data
+    """
+    return parse_stdlib(_build_reader(fp, csv.excel_tab()))
+
+
+def _build_reader(fp, dialect):
+    """Create a DictReader for fp using a fixed dialect instead of a sniffed one
+    :param fp: File pointer object
+    :param dialect: csv.Dialect instance created for this call
+    :return: csv.DictReader reading from the start of fp
+    """
+    # set the dialect instead of sniffing for it.
+    # sniffing can cause things like spaces or characters to be the delimiter
+    try:
+        data = fp.read(2048)
+        _set_dialect_quote_attrs(dialect, data)
+    except Exception:
+        # quote detection is best effort; content that cannot be read here is
+        # reported by parse_stdlib when the rows are consumed
+        pass
+    fp.seek(0)
+
+    return csv.DictReader(fp, dialect=dialect)
+
+
+def parse_stdlib(reader):
+    """Read and convert a csv like file to JSON format using the python standard library
+    :param reader: csv.DictReader for the file being rendered
+    :return: dict mapping the sheet name to a tuple of table headers and data
+    """
     columns = []
     # update the reader field names to avoid duplicate column names when performing row extraction
-    for idx, fieldname in enumerate(reader.fieldnames or []):
-        column_count = sum(1 for column in columns if fieldname == column['name'])
-        if column_count:
-            unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
-            reader.fieldnames[idx] = unique_fieldname
-        else:
-            unique_fieldname = fieldname
-        columns.append({
-            'id': unique_fieldname,
-            'field': unique_fieldname,
-            'name': fieldname,
-            'sortable': True,
-        })
-
     try:
+        for idx, fieldname in enumerate(reader.fieldnames or []):
+            column_count = sum(1 for column in columns if fieldname == column['name'])
+            if column_count:
+                unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
+                reader.fieldnames[idx] = unique_fieldname
+            else:
+                unique_fieldname = fieldname
+            columns.append({
+                'id': unique_fieldname,
+                'field': unique_fieldname,
+                'name': fieldname,
+                'sortable': True,
+            })
+
         rows = [row for row in reader]
     except csv.Error as e:
         if any("field larger than field limit" in errorMsg for errorMsg in e.args):
             raise TabularRendererError(
                 'This file contains a field too large to render. '
                 'Please download and view it locally.',
-                code=400,
+                code=HTTPStatus.BAD_REQUEST,
                 extension='csv',
             ) from e
         else:
-            raise TabularRendererError('csv.Error: {}'.format(e), extension='csv') from e
+            raise TabularRendererError('Cannot render file as csv/tsv. '
+                                       'The file may be empty or corrupt',
+                                       code=HTTPStatus.BAD_REQUEST,
+                                       extension='csv') from e
+    # Kept separate from the csv.Error handler: the `if any` check above assumes string
+    # args, which other exceptions (e.g. UnicodeDecodeError) do not guarantee
+    except Exception as e:
+        raise TabularRendererError('Cannot render file as csv/tsv. '
+                                   'The file may be empty or corrupt',
+                                   code=HTTPStatus.BAD_REQUEST,
+                                   extension='csv') from e
 
     if not columns and not rows:
-        raise EmptyTableError('Table empty or corrupt.', extension='csv')
+        raise EmptyTableError('Cannot render file as csv/tsv. '
+                              'The file may be empty or corrupt',
+                              code=HTTPStatus.BAD_REQUEST, extension='csv')
 
     return {'Sheet 1': (columns, rows)}
 
diff --git a/mfr/extensions/tabular/settings.py b/mfr/extensions/tabular/settings.py
index 87d46e885..8895e3cff 100644
--- a/mfr/extensions/tabular/settings.py
+++ b/mfr/extensions/tabular/settings.py
@@ -10,7 +10,7 @@
 
 LIBS = config.get('LIBS', {
     '.csv': [libs.csv_stdlib],
-    '.tsv': [libs.csv_stdlib],
+    '.tsv': [libs.tsv_stdlib],
     '.gsheet': [libs.xlsx_xlrd],
     '.xlsx': [libs.xlsx_xlrd],
     '.xls': [libs.xlsx_xlrd],
diff --git a/tests/extensions/tabular/files/invalid_null.csv b/tests/extensions/tabular/files/invalid_null.csv
new file mode 100644
index 000000000..c25eed4af
Binary files /dev/null and b/tests/extensions/tabular/files/invalid_null.csv differ
diff --git a/tests/extensions/tabular/test_stdlib_tools.py b/tests/extensions/tabular/test_stdlib_tools.py
new file mode 100644
index 000000000..3d958eb01
--- /dev/null
+++ b/tests/extensions/tabular/test_stdlib_tools.py
@@ -0,0 +1,58 @@
+import os
+from http import HTTPStatus
+from collections import OrderedDict
+
+import pytest
+
+from mfr.extensions.tabular.libs import stdlib_tools
+from mfr.extensions.tabular.exceptions import (EmptyTableError,
+                                               TabularRendererError)
+
+BASE = os.path.dirname(os.path.abspath(__file__))
+
+
+class TestTabularStdlibTools:
+
+    def test_csv_stdlib(self):
+        with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
+            sheets = stdlib_tools.csv_stdlib(fp)
+
+        sheet = sheets.popitem()[1]
+        assert sheet[0] == [
+            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
+            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
+            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
+        ]
+        assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
+        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])
+
+    def test_tsv_stdlib(self):
+        with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
+            sheets = stdlib_tools.tsv_stdlib(fp)
+
+        sheet = sheets.popitem()[1]
+        assert sheet[0] == [
+            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
+            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
+            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
+        ]
+        assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
+        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])
+
+    def test_tsv_stdlib_exception_raises(self):
+        with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
+            with pytest.raises(EmptyTableError) as e:
+                stdlib_tools.tsv_stdlib(fp)
+            assert e.value.code == HTTPStatus.BAD_REQUEST
+
+    def test_csv_stdlib_exception_raises(self):
+        with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
+            with pytest.raises(EmptyTableError) as e:
+                stdlib_tools.csv_stdlib(fp)
+            assert e.value.code == HTTPStatus.BAD_REQUEST
+
+    def test_csv_stdlib_other_exception_raises(self):
+        with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
+            with pytest.raises(TabularRendererError) as e:
+                stdlib_tools.csv_stdlib(fp)
+            assert e.value.code == HTTPStatus.BAD_REQUEST