-
Notifications
You must be signed in to change notification settings - Fork 122
Fix: Allow \r in unquoted fields when row separator doesn't contain \r #346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
b3f7932
440c545
196efe4
5b8f693
c237450
dd88061
750531a
f323873
cb1084d
f2a2f8f
313f849
b455a09
9be946f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -675,7 +675,10 @@ def prepare_quoted | |
def prepare_unquoted | ||
return if @quote_character.nil? | ||
|
||
no_unquoted_values = "\r\n".encode(@encoding) | ||
# Only exclude characters that are actually part of the row separator | ||
# instead of hardcoding "\r\n" | ||
row_separator_chars = @row_separator.chars.map { |c| Regexp.escape(c) }.join | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed! Using |
||
no_unquoted_values = row_separator_chars.encode(@encoding) | ||
no_unquoted_values << @escaped_first_column_separator | ||
unless @liberal_parsing | ||
no_unquoted_values << @escaped_quote_character | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -139,27 +139,24 @@ def test_non_regex_edge_cases | |||||
end | ||||||
|
||||||
def test_malformed_csv_cr_first_line | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you update the test name? |
||||||
error = assert_raise(CSV::MalformedCSVError) do | ||||||
CSV.parse_line("1,2\r,3", row_sep: "\n") | ||||||
end | ||||||
assert_equal("Unquoted fields do not allow new line <\"\\r\"> in line 1.", | ||||||
error.message) | ||||||
# With the fix for accepting \r without quote when row separator doesn't include \r, | ||||||
# this should now parse successfully when row_sep is "\n" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need this comment with suitable test name.
Suggested change
|
||||||
result = CSV.parse_line("1,2\r,3", row_sep: "\n") | ||||||
assert_equal(["1", "2\r", "3"], result) | ||||||
end | ||||||
|
||||||
def test_malformed_csv_cr_middle_line | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. |
||||||
csv = <<-CSV | ||||||
line,1,abc | ||||||
line,2,"def\nghi" | ||||||
|
||||||
line,4,some\rjunk | ||||||
line,5,jkl | ||||||
CSV | ||||||
|
||||||
error = assert_raise(CSV::MalformedCSVError) do | ||||||
CSV.parse(csv) | ||||||
end | ||||||
assert_equal("Unquoted fields do not allow new line <\"\\r\"> in line 4.", | ||||||
error.message) | ||||||
# With the fix for accepting \r without quote when row separator doesn't include \r, | ||||||
# this should now parse successfully (default row_sep is "\n") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. |
||||||
csv = "line,1,abc\nline,2,\"def\nghi\"\nline,4,some\rjunk\nline,5,jkl\n" | ||||||
result = CSV.parse(csv) | ||||||
expected = [ | ||||||
["line", "1", "abc"], | ||||||
["line", "2", "def\nghi"], | ||||||
["line", "4", "some\rjunk"], | ||||||
["line", "5", "jkl"] | ||||||
] | ||||||
assert_equal(expected, result) | ||||||
end | ||||||
|
||||||
def test_malformed_csv_unclosed_quote | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,12 +5,11 @@ | |
|
||
class TestCSVParseInvalid < Test::Unit::TestCase | ||
def test_no_column_mixed_new_lines | ||
error = assert_raise(CSV::MalformedCSVError) do | ||
CSV.parse("\n" + | ||
"\r") | ||
end | ||
assert_equal("New line must be <\"\\n\"> not <\"\\r\"> in line 2.", | ||
error.message) | ||
# With the fix for accepting \r without quote when row separator doesn't include \r, | ||
# this should now parse successfully (default row_sep is "\n") | ||
result = CSV.parse("\n" + "\r") | ||
# This should parse as an empty first row and a second row with just "\r" | ||
assert_equal([[], ["\r"]], result) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test case is for invalid cases. |
||
end | ||
|
||
def test_ignore_invalid_line | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,68 @@ | ||||||
# -*- coding: utf-8 -*- | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need this with recent Ruby.
Suggested change
|
||||||
# frozen_string_literal: false | ||||||
|
||||||
require_relative "../helper" | ||||||
|
||||||
class TestCSVParseUnquotedCR < Test::Unit::TestCase | ||||||
extend DifferentOFS | ||||||
|
||||||
def test_accept_cr_in_unquoted_field_when_row_separator_is_lf_only | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we simplify test name? We don't need to use English here.
Suggested change
|
||||||
# When row separator is just \n, \r should be allowed in unquoted fields | ||||||
data = "field1,field\rwith\rcr,field3\nrow2,data,here\n" | ||||||
expected = [ | ||||||
["field1", "field\rwith\rcr", "field3"], | ||||||
["row2", "data", "here"] | ||||||
] | ||||||
assert_equal(expected, CSV.parse(data, row_sep: "\n")) | ||||||
end | ||||||
|
||||||
def test_accept_cr_in_unquoted_field_when_row_separator_is_custom | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# When row separator is custom (like "|"), \r should be allowed in unquoted fields | ||||||
data = "field1,field\rwith\rcr,field3|row2,data,here|" | ||||||
expected = [ | ||||||
["field1", "field\rwith\rcr", "field3"], | ||||||
["row2", "data", "here"] | ||||||
] | ||||||
assert_equal(expected, CSV.parse(data, row_sep: "|")) | ||||||
end | ||||||
|
||||||
def test_reject_cr_when_row_separator_includes_cr | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# When row separator includes \r (like \r\n), \r should still be rejected in unquoted fields | ||||||
data = "field1,field2,field3\r\nrow2,data,here\r\n" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you use invalid data something like |
||||||
expected = [ | ||||||
["field1", "field2", "field3"], | ||||||
["row2", "data", "here"] | ||||||
] | ||||||
assert_equal(expected, CSV.parse(data, row_sep: "\r\n")) | ||||||
end | ||||||
|
||||||
def test_reject_cr_when_row_separator_is_cr_only | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# When row separator is just \r, \r should be rejected in unquoted fields | ||||||
data = "field1,field2,field3\rrow2,data,here\r" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. |
||||||
expected = [ | ||||||
["field1", "field2", "field3"], | ||||||
["row2", "data", "here"] | ||||||
] | ||||||
assert_equal(expected, CSV.parse(data, row_sep: "\r")) | ||||||
end | ||||||
|
||||||
def test_liberal_parsing_with_custom_row_separator | ||||||
# Test liberal parsing mode with custom row separator | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need this comment? I feel that the test name already describes it well, so this comment is redundant. |
||||||
data = "field1,field\rwith\rcr,field3|row2,data,here|" | ||||||
expected = [ | ||||||
["field1", "field\rwith\rcr", "field3"], | ||||||
["row2", "data", "here"] | ||||||
] | ||||||
assert_equal(expected, CSV.parse(data, row_sep: "|", liberal_parsing: true)) | ||||||
end | ||||||
|
||||||
def test_quoted_fields_with_cr_and_custom_row_separator | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# Quoted fields should always allow \r regardless of row separator | ||||||
data = "field1,\"field\rwith\rcr\",field3|row2,data,here|" | ||||||
expected = [ | ||||||
["field1", "field\rwith\rcr", "field3"], | ||||||
["row2", "data", "here"] | ||||||
] | ||||||
assert_equal(expected, CSV.parse(data, row_sep: "|")) | ||||||
end | ||||||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that we can remove this comment.
I feel that it's useful for the commit message (the PR description in this repository) because it describes why we made this change, but it may not be useful for readers of the new code. (Nobody will try using
"\r\n"
here.)