[SVCS-531] Separate csv and tsv function and remove use of sniff #285
mfr/extensions/tabular/libs/stdlib_tools.py
@@ -1,57 +1,91 @@
import re
import csv
from http import HTTPStatus

from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
from mfr.extensions.tabular import utilities
from mfr.extensions.tabular.exceptions import (EmptyTableError,
                                               TabularRendererError)


def csv_stdlib(fp):
    """Read and convert a csv file to JSON format using the python standard library
    :param fp: File pointer object
    :return: tuple of table headers and data
    """
    data = fp.read(2048)
    data = fp.seek(2048)
    fp.seek(0)
    # set the dialect instead of sniffing for it.
    # sniffing can cause things like spaces or characters to be the delimiter
    dialect = csv.excel
Review comment: Hmmm. I'm not sure how I feel about this. I like that it solves the issue of really long lines, but it bugs me a bit that we're throwing out support for alternative delimiters. If we use the …
Reply: Complete. PR308
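For context on the trade-off discussed above, here is a minimal sketch (not part of the PR, sample text made up) contrasting sniffing with the fixed `csv.excel` dialect:

```python
import csv
import io

sample = "one,two,three\na b,c d,e f\n"

# Sniffing guesses the dialect from a sample of the file; on short or unusual
# samples the guess can land on a space or another character instead of the
# comma, or fail outright.
try:
    guessed = csv.Sniffer().sniff(sample)
    print("sniffed delimiter:", repr(guessed.delimiter))
except csv.Error:
    print("sniffer could not determine a dialect")

# The fixed dialect is deterministic: always comma-delimited, no guessing.
reader = csv.DictReader(io.StringIO(sample), dialect=csv.excel)
print(reader.fieldnames)  # ['one', 'two', 'three']
```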
    try:
        _set_dialect_quote_attrs(dialect, data)
    except:
        # if this errors it is not an exception
        pass

    reader = csv.DictReader(fp, dialect=dialect)
    return parse_stdlib(reader)


def tsv_stdlib(fp):
    data = fp.seek(2048)
Review comment: `seek` just returns the offset the pointer was advanced to. This should probably be `read`.
Reply: Complete. PR308
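To illustrate the point above, a quick sketch (not part of the PR) of what each call returns:

```python
import io

fp = io.StringIO("one,two,three\na,b,c\n")

print(fp.seek(2048))  # 2048 -- seek() only returns the new stream position
fp.seek(0)
print(fp.read(2048))  # the actual sample text that the dialect/quote logic needs
```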
    fp.seek(0)
    # set the dialect instead of sniffing for it.
    # sniffing can cause things like spaces or characters to be the delimiter
    dialect = csv.excel_tab
    try:
        dialect = csv.Sniffer().sniff(data)
    except csv.Error:
        dialect = csv.excel
    else:
        _set_dialect_quote_attrs(dialect, data)
Review comment: I don't think I've ever seen a tsv with quoting in it. Has anyone else? Maybe we leave quoting alone until it's reported as an issue.
Reply: Complete. PR308
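For reference, a small sketch (not part of the PR) of what quote handling buys on a TSV: with the default quote character, a quoted field can itself contain a literal tab:

```python
import csv
import io

# The first field is quoted and contains a literal tab character.
sample = io.StringIO('"a\tb"\tc\n1\t2\n')

reader = csv.reader(sample, dialect=csv.excel_tab)
print(next(reader))  # ['a\tb', 'c'] -- the quoted tab stays inside one field
print(next(reader))  # ['1', '2']
```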
    except:
        # if this errors it is not an exception
        pass

    reader = csv.DictReader(fp, dialect=dialect)
    return parse_stdlib(reader)

def parse_stdlib(reader):
    """Read and convert a csv like file to JSON format using the python standard library
    :param fp: File pointer object
    :return: tuple of table headers and data
    """
    columns = []
    # update the reader field names to avoid duplicate column names when performing row extraction
    for idx, fieldname in enumerate(reader.fieldnames or []):
        column_count = sum(1 for column in columns if fieldname == column['name'])
        if column_count:
            unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
            reader.fieldnames[idx] = unique_fieldname
        else:
            unique_fieldname = fieldname
        columns.append({
            'id': unique_fieldname,
            'field': unique_fieldname,
            'name': fieldname,
            'sortable': True,
        })

    try:
        for idx, fieldname in enumerate(reader.fieldnames or []):
            column_count = sum(1 for column in columns if fieldname == column['name'])
            if column_count:
                unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
                reader.fieldnames[idx] = unique_fieldname
            else:
                unique_fieldname = fieldname
            columns.append({
                'id': unique_fieldname,
                'field': unique_fieldname,
                'name': fieldname,
                'sortable': True,
            })

        rows = [row for row in reader]
    except csv.Error as e:
        if any("field larger than field limit" in errorMsg for errorMsg in e.args):
            raise TabularRendererError(
                'This file contains a field too large to render. '
                'Please download and view it locally.',
                code=400,
                code=HTTPStatus.BAD_REQUEST,
                extension='csv',
Review comment: Since both the csv and tsv parser call this, can we make sure the correct extension is being passed?
Reply: Complete. PR308
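One way to address this comment would be to thread the extension through from the callers. The sketch below is hypothetical (the `ext` parameter and the trimmed-down function bodies are illustrative, not the change that landed in PR308), with `ValueError` standing in for TabularRendererError so it runs on its own:

```python
import csv
import io

def parse_stdlib(reader, ext='csv'):
    # Hypothetical: report the caller's extension instead of hard-coding 'csv'.
    try:
        rows = [row for row in reader]
    except csv.Error as e:
        raise ValueError('Cannot render file as {}'.format(ext)) from e
    return rows

def tsv_stdlib(fp):
    reader = csv.DictReader(fp, dialect=csv.excel_tab)
    return parse_stdlib(reader, ext='tsv')  # 'tsv', not the hard-coded 'csv'

print(tsv_stdlib(io.StringIO("a\tb\n1\t2\n")))  # e.g. [{'a': '1', 'b': '2'}]
```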
            ) from e
        else:
            raise TabularRendererError('csv.Error: {}'.format(e), extension='csv') from e
            raise TabularRendererError('Cannot render file as csv/tsv. '
                                       'The file may be empty or corrupt',
                                       code=HTTPStatus.BAD_REQUEST,
                                       extension='csv') from e

Review comment: Since this is identical to the error raised in the next stanza, could we just throw the error instead?
    # Outside other except because the `if any` line causes more errors to be raised
    # on certain exceptions
    except Exception as e:
        raise TabularRendererError('Cannot render file as csv/tsv. '
                                   'The file may be empty or corrupt',
Review comment: Nitpick: indentation is weird here.
                                   code=HTTPStatus.BAD_REQUEST,
                                   extension='csv') from e

    if not columns and not rows:
        raise EmptyTableError('Table empty or corrupt.', extension='csv')
        raise EmptyTableError('Cannot render file as csv/tsv. '
                              'The file may be empty or corrupt',
                              code=HTTPStatus.BAD_REQUEST, extension='csv')

    return {'Sheet 1': (columns, rows)}

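Before the new tests below, a usage sketch of the new code path: based on the diff above, duplicate header names should be renamed by `parse_stdlib` so row extraction keeps both values. Expected output is inferred from the diff, not verified against the merged code:

```python
import io
from mfr.extensions.tabular.libs import stdlib_tools

# Two columns share the name 'amount'; the second should come back as 'amount-2'.
fp = io.StringIO("amount,amount,label\n1,2,x\n")
columns, rows = stdlib_tools.csv_stdlib(fp)['Sheet 1']

print([col['field'] for col in columns])  # expected: ['amount', 'amount-2', 'label']
print(rows[0]['amount-2'])                # expected: '2'
```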
New test file for stdlib_tools:
@@ -0,0 +1,58 @@
import os
from http import HTTPStatus
from collections import OrderedDict

import pytest

from mfr.extensions.tabular.libs import stdlib_tools
from mfr.extensions.tabular.exceptions import (EmptyTableError,
                                               TabularRendererError)

BASE = os.path.dirname(os.path.abspath(__file__))


class TestTabularStdlibTools:

    def test_csv_stdlib(self):
        with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
            sheets = stdlib_tools.csv_stdlib(fp)

        sheet = sheets.popitem()[1]
        assert sheet[0] == [
            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
        ]
        assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

    def test_tsv_stdlib(self):
        with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
            sheets = stdlib_tools.tsv_stdlib(fp)

        sheet = sheets.popitem()[1]
        assert sheet[0] == [
            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
        ]
        assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

    def test_tsv_stdlib_exception_raises(self):
        with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
            with pytest.raises(EmptyTableError) as e:
                stdlib_tools.tsv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST
    def test_csv_stdlib_exception_raises(self):
        with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
            with pytest.raises(EmptyTableError) as e:
                stdlib_tools.csv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST

    def test_csv_stdlib_other_exception_raises(self):
        with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
            with pytest.raises(TabularRendererError) as e:
                stdlib_tools.csv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST
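The invalid.* fixture files are not shown in this diff; as a hypothetical illustration of the two failure paths the tests exercise, an empty stream should hit EmptyTableError, while a NUL byte should surface as a csv.Error and be re-raised as TabularRendererError:

```python
import io
from mfr.extensions.tabular.libs import stdlib_tools
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError

# No header and no rows -> EmptyTableError (hypothetical stand-in for invalid.csv).
try:
    stdlib_tools.csv_stdlib(io.StringIO(""))
except EmptyTableError as exc:
    print(exc.code)  # expected: HTTPStatus.BAD_REQUEST

# A NUL byte makes the csv module raise csv.Error, which parse_stdlib wraps
# (hypothetical stand-in for invalid_null.csv).
try:
    stdlib_tools.csv_stdlib(io.StringIO("a,b\n\x00,1\n"))
except TabularRendererError as exc:
    print(exc.code)  # expected: HTTPStatus.BAD_REQUEST
```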