Skip to content

[SVCS-531] Separate csv and tsv function and remove use of sniff #285

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mfr/extensions/tabular/libs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ def csv_stdlib():
return csv_stdlib


def tsv_stdlib():
from ..libs.stdlib_tools import tsv_stdlib
return tsv_stdlib


def csv_pandas():
from ..libs.panda_tools import csv_pandas
return csv_pandas
Expand Down
88 changes: 61 additions & 27 deletions mfr/extensions/tabular/libs/stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,91 @@
import re
import csv
from http import HTTPStatus

from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
from mfr.extensions.tabular import utilities
from mfr.extensions.tabular.exceptions import (EmptyTableError,
TabularRendererError)


def csv_stdlib(fp):
"""Read and convert a csv file to JSON format using the python standard library
:param fp: File pointer object
:return: tuple of table headers and data
"""
data = fp.read(2048)
data = fp.seek(2048)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seek just returns the new absolute offset of the file pointer, so no data is actually read here. This should probably be read.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

fp.seek(0)
# set the dialect instead of sniffing for it.
# sniffing can cause things like spaces or characters to be the delimiter
dialect = csv.excel
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. I'm not sure how I feel about this. I like that it solves the issue of really long lines, but it bugs me a bit that we're throwing out support for alternative delimiters. If we use the csv.excel dialect, do we still support tab- and pipe-delimited text? If not, can we document that in a comment, so we'll know what to fix if we encounter it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

try:
_set_dialect_quote_attrs(dialect, data)
except:
# if this errors it is not an exception
pass

reader = csv.DictReader(fp, dialect=dialect)
return parse_stdlib(reader)

def tsv_stdlib(fp):
data = fp.seek(2048)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seek => read

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

fp.seek(0)
# set the dialect instead of sniffing for it.
# sniffing can cause things like spaces or characters to be the delimiter
dialect = csv.excel_tab
try:
dialect = csv.Sniffer().sniff(data)
except csv.Error:
dialect = csv.excel
else:
_set_dialect_quote_attrs(dialect, data)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I've ever seen a tsv with quoting in it. Has anyone else? Maybe we leave quoting alone until it's reported as an issue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

except:
# if this errors it is not an exception
pass

reader = csv.DictReader(fp, dialect=dialect)
return parse_stdlib(reader)

def parse_stdlib(reader):
"""Read and convert a csv like file to JSON format using the python standard library
:param fp: File pointer object
:return: tuple of table headers and data
"""
columns = []
# update the reader field names to avoid duplicate column names when performing row extraction
for idx, fieldname in enumerate(reader.fieldnames or []):
column_count = sum(1 for column in columns if fieldname == column['name'])
if column_count:
unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
reader.fieldnames[idx] = unique_fieldname
else:
unique_fieldname = fieldname
columns.append({
'id': unique_fieldname,
'field': unique_fieldname,
'name': fieldname,
'sortable': True,
})

try:
for idx, fieldname in enumerate(reader.fieldnames or []):
column_count = sum(1 for column in columns if fieldname == column['name'])
if column_count:
unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
reader.fieldnames[idx] = unique_fieldname
else:
unique_fieldname = fieldname
columns.append({
'id': unique_fieldname,
'field': unique_fieldname,
'name': fieldname,
'sortable': True,
})

rows = [row for row in reader]
except csv.Error as e:
if any("field larger than field limit" in errorMsg for errorMsg in e.args):
raise TabularRendererError(
'This file contains a field too large to render. '
'Please download and view it locally.',
code=400,
code=HTTPStatus.BAD_REQUEST,
extension='csv',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since both the csv and tsv parser call this, can we make sure the correct extension is being passed?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

) from e
else:
raise TabularRendererError('csv.Error: {}'.format(e), extension='csv') from e
raise TabularRendererError('Cannot render file as csv/tsv. '
'The file may be empty or corrupt',
code=HTTPStatus.BAD_REQUEST,
extension='csv') from e

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is identical to the error raised in the next stanza, could we just throw the error instead?

# Outside other except because the `if any` line causes more errors to be raised
# on certain exceptions
except Exception as e:
raise TabularRendererError('Cannot render file as csv/tsv. '
'The file may be empty or corrupt',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: indentation is weird here.

code=HTTPStatus.BAD_REQUEST,
extension='csv') from e

if not columns and not rows:
raise EmptyTableError('Table empty or corrupt.', extension='csv')
raise EmptyTableError('Cannot render file as csv/tsv. '
'The file may be empty or corrupt',
code=HTTPStatus.BAD_REQUEST, extension='csv')

return {'Sheet 1': (columns, rows)}

Expand Down
2 changes: 1 addition & 1 deletion mfr/extensions/tabular/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

LIBS = config.get('LIBS', {
'.csv': [libs.csv_stdlib],
'.tsv': [libs.csv_stdlib],
'.tsv': [libs.tsv_stdlib],
'.gsheet': [libs.xlsx_xlrd],
'.xlsx': [libs.xlsx_xlrd],
'.xls': [libs.xlsx_xlrd],
Expand Down
Binary file added tests/extensions/tabular/files/invalid_null.csv
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/extensions/tabular/test_stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
from http import HTTPStatus
from collections import OrderedDict

import pytest

from mfr.extensions.tabular.libs import stdlib_tools
from mfr.extensions.tabular.exceptions import(EmptyTableError,
TabularRendererError)

BASE = os.path.dirname(os.path.abspath(__file__))


class TestTabularStdlibTools:

def test_csv_stdlib(self):
with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
sheets = stdlib_tools.csv_stdlib(fp)

sheet = sheets.popitem()[1]
assert sheet[0] == [
{'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
{'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
{'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
]
assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

def test_tsv_stdlib(self):
with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
sheets = stdlib_tools.tsv_stdlib(fp)

sheet = sheets.popitem()[1]
assert sheet[0] == [
{'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
{'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
{'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
]
assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

def test_tsv_stdlib_exception_raises(self):
with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
with pytest.raises(EmptyTableError) as e:
stdlib_tools.tsv_stdlib(fp)
assert e.value.code == HTTPStatus.BAD_REQUEST

def test_csv_stdlib_exception_raises(self):
with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
with pytest.raises(EmptyTableError) as e:
stdlib_tools.tsv_stdlib(fp)
assert e.value.code == HTTPStatus.BAD_REQUEST

def test_csv_stdlib_other_exception_raises(self):
with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
with pytest.raises(TabularRendererError) as e:
stdlib_tools.tsv_stdlib(fp)
assert e.value.code == HTTPStatus.BAD_REQUEST