Skip to content

[SVCS-531] Separate csv and tsv function and remove use of sniff #285

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mfr/extensions/tabular/libs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ def csv_stdlib():
return csv_stdlib


def tsv_stdlib():
from ..libs.stdlib_tools import tsv_stdlib
return tsv_stdlib


def csv_pandas():
from ..libs.panda_tools import csv_pandas
return csv_pandas
Expand Down
88 changes: 61 additions & 27 deletions mfr/extensions/tabular/libs/stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,91 @@
import re
import csv
from http import HTTPStatus

from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
from mfr.extensions.tabular import utilities
from mfr.extensions.tabular.exceptions import (EmptyTableError,
TabularRendererError)


def csv_stdlib(fp):
"""Read and convert a csv file to JSON format using the python standard library
:param fp: File pointer object
:return: tuple of table headers and data
"""
data = fp.read(2048)
data = fp.seek(2048)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seek just returns the new absolute offset of the file pointer, so no data is actually read here. This should probably be read.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

fp.seek(0)
# set the dialect instead of sniffing for it.
# sniffing can cause things like spaces or characters to be the delimiter
dialect = csv.excel
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. I'm not sure how I feel about this. I like that it solves the issue of really long lines, but it bugs me a bit that we're throwing out support for alternative delimiters. If we use the csv.excel dialect, do we still support tab- and pipe-delimited text? If not, can we document that in a comment, so we'll know what to fix if we encounter it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

try:
_set_dialect_quote_attrs(dialect, data)
except:
# if this errors it is not an exception
pass

reader = csv.DictReader(fp, dialect=dialect)
return parse_stdlib(reader)

def tsv_stdlib(fp):
data = fp.seek(2048)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seek => read

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

fp.seek(0)
# set the dialect instead of sniffing for it.
# sniffing can cause things like spaces or characters to be the delimiter
dialect = csv.excel_tab
try:
dialect = csv.Sniffer().sniff(data)
except csv.Error:
dialect = csv.excel
else:
_set_dialect_quote_attrs(dialect, data)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I've ever seen a tsv with quoting in it. Has anyone else? Maybe we leave quoting alone until it's reported as an issue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

except:
# if this errors it is not an exception
pass

reader = csv.DictReader(fp, dialect=dialect)
return parse_stdlib(reader)

def parse_stdlib(reader):
"""Read and convert a csv like file to JSON format using the python standard library
:param fp: File pointer object
:return: tuple of table headers and data
"""
columns = []
# update the reader field names to avoid duplicate column names when performing row extraction
for idx, fieldname in enumerate(reader.fieldnames or []):
column_count = sum(1 for column in columns if fieldname == column['name'])
if column_count:
unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
reader.fieldnames[idx] = unique_fieldname
else:
unique_fieldname = fieldname
columns.append({
'id': unique_fieldname,
'field': unique_fieldname,
'name': fieldname,
'sortable': True,
})

try:
for idx, fieldname in enumerate(reader.fieldnames or []):
column_count = sum(1 for column in columns if fieldname == column['name'])
if column_count:
unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
reader.fieldnames[idx] = unique_fieldname
else:
unique_fieldname = fieldname
columns.append({
'id': unique_fieldname,
'field': unique_fieldname,
'name': fieldname,
'sortable': True,
})

rows = [row for row in reader]
except csv.Error as e:
if any("field larger than field limit" in errorMsg for errorMsg in e.args):
raise TabularRendererError(
'This file contains a field too large to render. '
'Please download and view it locally.',
code=400,
code=HTTPStatus.BAD_REQUEST,
extension='csv',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since both the csv and tsv parser call this, can we make sure the correct extension is being passed?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete. PR308

) from e
else:
raise TabularRendererError('csv.Error: {}'.format(e), extension='csv') from e
raise TabularRendererError('Cannot render file as csv/tsv. '
'The file may be empty or corrupt',
code=HTTPStatus.BAD_REQUEST,
extension='csv') from e

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is identical to the error raised in the next stanza, could we just throw the error instead?

# Outside other except because the `if any` line causes more errors to be raised
# on certain exceptions
except Exception as e:
raise TabularRendererError('Cannot render file as csv/tsv. '
'The file may be empty or corrupt',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: indentation is weird here.

code=HTTPStatus.BAD_REQUEST,
extension='csv') from e

if not columns and not rows:
raise EmptyTableError('Table empty or corrupt.', extension='csv')
raise EmptyTableError('Cannot render file as csv/tsv. '
'The file may be empty or corrupt',
code=HTTPStatus.BAD_REQUEST, extension='csv')

return {'Sheet 1': (columns, rows)}

Expand Down
2 changes: 1 addition & 1 deletion mfr/extensions/tabular/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

LIBS = config.get('LIBS', {
'.csv': [libs.csv_stdlib],
'.tsv': [libs.csv_stdlib],
'.tsv': [libs.tsv_stdlib],
'.gsheet': [libs.xlsx_xlrd],
'.xlsx': [libs.xlsx_xlrd],
'.xls': [libs.xlsx_xlrd],
Expand Down
Binary file added tests/extensions/tabular/files/invalid_null.csv
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/extensions/tabular/test_stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
from http import HTTPStatus
from collections import OrderedDict

import pytest

from mfr.extensions.tabular.libs import stdlib_tools
from mfr.extensions.tabular.exceptions import(EmptyTableError,
TabularRendererError)

BASE = os.path.dirname(os.path.abspath(__file__))


class TestTabularStdlibTools:

def test_csv_stdlib(self):
with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
sheets = stdlib_tools.csv_stdlib(fp)

sheet = sheets.popitem()[1]
assert sheet[0] == [
{'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
{'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
{'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
]
assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

def test_tsv_stdlib(self):
with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
sheets = stdlib_tools.tsv_stdlib(fp)

sheet = sheets.popitem()[1]
assert sheet[0] == [
{'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
{'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
{'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
]
assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

def test_tsv_stdlib_exception_raises(self):
with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
with pytest.raises(EmptyTableError) as e:
stdlib_tools.tsv_stdlib(fp)
assert e.value.code == HTTPStatus.BAD_REQUEST

def test_csv_stdlib_exception_raises(self):
with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
with pytest.raises(EmptyTableError) as e:
stdlib_tools.tsv_stdlib(fp)
assert e.value.code == HTTPStatus.BAD_REQUEST

def test_csv_stdlib_other_exception_raises(self):
with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
with pytest.raises(TabularRendererError) as e:
stdlib_tools.tsv_stdlib(fp)
assert e.value.code == HTTPStatus.BAD_REQUEST