diff --git a/mfr/extensions/tabular/libs/xlrd_tools.py b/mfr/extensions/tabular/libs/xlrd_tools.py index fe4b19742..8a5983f6d 100644 --- a/mfr/extensions/tabular/libs/xlrd_tools.py +++ b/mfr/extensions/tabular/libs/xlrd_tools.py @@ -1,13 +1,15 @@ -import xlrd +import uuid from collections import OrderedDict -from ..exceptions import TableTooBigError -from ..utilities import header_population +import xlrd + from mfr.extensions.tabular.compat import range, basestring +from mfr.extensions.tabular.utilities import header_population +from mfr.extensions.tabular.exceptions import TableTooBigError -def xlsx_xlrd(fp): - """Read and convert a xlsx file to JSON format using the xlrd library +def xlsx_xlrd(fp, max_iterations=5000): + """Read and convert a xlsx file to JSON format using the xlrd library. :param fp: File pointer object :return: tuple of table headers and data """ @@ -34,6 +36,30 @@ def xlsx_xlrd(fp): for index, value in enumerate(fields) ] + # Duplicate header fields create errors, we need to rename them + duplicate_fields = set([x for x in fields if fields.count(x) > 1]) + if len(duplicate_fields): + counts = {} + for field in duplicate_fields: + counts[field] = 1 + + for i, field in enumerate(fields): + if field in duplicate_fields: + increased_name = field + ' ({})'.format(counts[field]) + + # this triggers if you try to rename a header, and that new name + # already exists in fields. it will then increment to look for the + # next available name. + iteration = 0 + while increased_name in fields: + iteration += 1 + if iteration > max_iterations: + increased_name = field + ' ({})'.format(uuid.uuid4()) + else: + counts[field] += 1 + increased_name = field + ' ({})'.format(counts[field]) + + fields[i] = increased_name data = [] for i in range(1, sheet.nrows): row = [] diff --git a/tests/extensions/tabular/files/test_duplicate.xlsx b/tests/extensions/tabular/files/test_duplicate.xlsx new file mode 100644 index 000000000..a8151ab4d Binary files /dev/null and b/tests/extensions/tabular/files/test_duplicate.xlsx differ diff --git a/tests/extensions/tabular/files/test_duplicate_uuid.xlsx b/tests/extensions/tabular/files/test_duplicate_uuid.xlsx new file mode 100644 index 000000000..f10abfe1e Binary files /dev/null and b/tests/extensions/tabular/files/test_duplicate_uuid.xlsx differ diff --git a/tests/extensions/tabular/test_xlsx_tools.py b/tests/extensions/tabular/test_xlsx_tools.py index 1cecf369e..fa8e09456 100644 --- a/tests/extensions/tabular/test_xlsx_tools.py +++ b/tests/extensions/tabular/test_xlsx_tools.py @@ -1,5 +1,7 @@ import os +import pytest + from mfr.extensions.tabular.libs import xlrd_tools BASE = os.path.dirname(os.path.abspath(__file__)) @@ -17,4 +19,34 @@ def test_xlsx_xlrd(self): assert sheet[0][2] == {'field': 'three', 'id': 'three', 'name': 'three', 'sortable': True} assert sheet[1][0] == {'one': 'a', 'two': 'b', 'three': 'c'} assert sheet[1][1] == {'one': 1.0, 'two': 2.0, 'three': 3.0} - assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98','three': u'set\\x99'} + assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98', 'three': u'set\\x99'} + + def test_xlsx_xlrd_duplicate_fields(self): + with open(os.path.join(BASE, 'files', 'test_duplicate.xlsx')) as fp: + sheets = xlrd_tools.xlsx_xlrd(fp) + + sheet = sheets.popitem()[1] + assert sheet[0][0] == {'id': 'Name', 'name': 'Name', 'field': 'Name', 'sortable': True} + assert sheet[0][1] == {'id': 'Dup (1)', 'name': 'Dup (1)', + 'field': 'Dup (1)', 'sortable': True} + assert sheet[0][2] == {'id': 'Dup (2)', 'name': 'Dup (2)', + 'field': 'Dup (2)', 'sortable': True} + assert sheet[0][3] == {'id': 'Dup (3)', 'name': 'Dup (3)', + 'field': 'Dup (3)', 'sortable': True} + assert sheet[0][4] == {'id': 'Dup (4)', 'name': 'Dup (4)', + 'field': 'Dup (4)', 'sortable': True} + assert sheet[0][5] == {'id': 'Not Dup', 'name': 'Not Dup', + 'field': 'Not Dup', 'sortable': True} + assert sheet[1][0] == {'Name': 1.0, 'Dup (1)': 2.0, 'Dup (2)': 3.0, + 'Dup (3)': 4.0, 'Dup (4)': 5.0, 'Not Dup': 6.0} + + def test_xlsx_xlrd_duplicate_fields_handle_naming(self): + with open(os.path.join(BASE, 'files', 'test_duplicate_uuid.xlsx')) as fp: + sheets = xlrd_tools.xlsx_xlrd(fp, max_iterations=10) + + sheet = sheets.popitem()[1] + + # this if you raise max iterations, it will be named dup (13) instead of dup () + assert sheet[0][1]['field'] != 'dup (13)' + # using `len` is an easy way to see the uuid has been appended + assert len(sheet[0][1]['field']) > 24