From bf96760e8f9c886cafe465bc4f6d2d9333c915f7 Mon Sep 17 00:00:00 2001 From: Pieter Pas Date: Thu, 29 Apr 2021 01:41:39 +0200 Subject: [PATCH 1/5] Crash when printing Unicode to redirected cout Add failing tests --- tests/test_iostream.py | 80 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/test_iostream.py b/tests/test_iostream.py index 6d493beda3..213569a8d3 100644 --- a/tests/test_iostream.py +++ b/tests/test_iostream.py @@ -68,6 +68,86 @@ def test_captured_large_string(capsys): assert stdout == msg assert stderr == "" +def test_captured_utf8_2byte_offset0(capsys): + msg = "\u07FF" + msg = "" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_2byte_offset1(capsys): + msg = "\u07FF" + msg = "1" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_3byte_offset0(capsys): + msg = "\uFFFF" + msg = "" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_3byte_offset1(capsys): + msg = "\uFFFF" + msg = "1" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_3byte_offset2(capsys): + msg = "\uFFFF" + msg = "12" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_4byte_offset0(capsys): + msg = "\U0010FFFF" + msg = "" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_4byte_offset1(capsys): + msg = "\U0010FFFF" + msg = "1" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_4byte_offset2(capsys): + msg = "\U0010FFFF" + msg = "12" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" + +def test_captured_utf8_4byte_offset3(capsys): + msg = "\U0010FFFF" + msg = "123" + msg * (1024 // len(msg) + 1) + + m.captured_output_default(msg) + stdout, stderr = capsys.readouterr() + assert stdout == msg + assert stderr == "" def test_guard_capture(capsys): msg = "I've been redirected to Python, I hope!" From 4d655b0c1ba99f7f69a117233fce7ceaf27dba08 Mon Sep 17 00:00:00 2001 From: Pieter Pas Date: Thu, 29 Apr 2021 01:58:29 +0200 Subject: [PATCH 2/5] Fix Unicode crashes redirected cout --- include/pybind11/iostream.h | 61 ++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/include/pybind11/iostream.h b/include/pybind11/iostream.h index 9dee755431..cd25498a41 100644 --- a/include/pybind11/iostream.h +++ b/include/pybind11/iostream.h @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) PYBIND11_NAMESPACE_BEGIN(detail) @@ -38,6 +41,47 @@ class pythonbuf : public std::streambuf { return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof(); } + // Computes how many bytes at the end of the buffer are part of an + // incomplete sequence of UTF-8 bytes. + // Precondition: pbase() < pptr() + size_t utf8_remainder() const { + const auto rbase = std::reverse_iterator(pbase()); + const auto rpptr = std::reverse_iterator(pptr()); + auto is_ascii = [](char c) { + return (static_cast(c) & 0x80) == 0x00; + }; + auto is_leading = [](char c) { + return (static_cast(c) & 0xC0) == 0xC0; + }; + auto is_leading_2b = [](char c) { + return static_cast(c) <= 0xDF; + }; + auto is_leading_3b = [](char c) { + return static_cast(c) <= 0xEF; + }; + // If the last character is ASCII, there are no incomplete code points + if (is_ascii(*rpptr)) + return 0; + // Otherwise, work back from the end of the buffer and find the first + // UTF-8 leading byte + const auto rpend = rbase - rpptr >= 3 ? rpptr + 3 : rbase; + const auto leading = std::find_if(rpptr, rpend, is_leading); + const auto dist = static_cast(leading - rpptr); + size_t remainder = 0; + + if (dist == 0) + remainder = 1; // 1-byte code point is impossible + else if (dist == 1) + remainder = is_leading_2b(*leading) ? 0 : dist + 1; + else if (dist == 2) + remainder = is_leading_3b(*leading) ? 0 : dist + 1; + // else if (dist >= 3), at least 4 bytes before encountering an UTF-8 + // leading byte, either no remainder or invalid UTF-8. + // Invalid UTF-8 will cause an exception later when converting + // to a Python string, so that's not handled here. + return remainder; + } + // This function must be non-virtual to be called in a destructor. If the // rare MSVC test failure shows up with this version, then this should be // simplified to a fully qualified call. @@ -48,13 +92,22 @@ class pythonbuf : public std::streambuf { gil_scoped_acquire tmp; // This subtraction cannot be negative, so dropping the sign. - str line(pbase(), static_cast(pptr() - pbase())); + auto size = static_cast(pptr() - pbase()); + size_t remainder = utf8_remainder(); + + if (size > remainder) { + str line(pbase(), size - remainder); + pywrite(line); + pyflush(); + } - pywrite(line); - pyflush(); + // Placed inside gil_scoped_aquire as a mutex to avoid a race. - // Placed inside gil_scoped_aquire as a mutex to avoid a race + // Copy the remainder at the end of the buffer to the beginning: + if (remainder > 0) + std::memmove(pbase(), pptr() - remainder, remainder); setp(pbase(), epptr()); + pbump(static_cast(remainder)); } } From 9b47409007b6b30687c40e2c643d8710e932fdab Mon Sep 17 00:00:00 2001 From: Pieter Pas Date: Fri, 30 Apr 2021 00:40:20 +0200 Subject: [PATCH 3/5] pythonbuf::utf8_remainder check end iterator --- include/pybind11/iostream.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/pybind11/iostream.h b/include/pybind11/iostream.h index cd25498a41..b3d6e78273 100644 --- a/include/pybind11/iostream.h +++ b/include/pybind11/iostream.h @@ -66,6 +66,8 @@ class pythonbuf : public std::streambuf { // UTF-8 leading byte const auto rpend = rbase - rpptr >= 3 ? rpptr + 3 : rbase; const auto leading = std::find_if(rpptr, rpend, is_leading); + if (leading == rbase) + return 0; const auto dist = static_cast(leading - rpptr); size_t remainder = 0; From 45a91d828f948b56e416203253527b96863a0daf Mon Sep 17 00:00:00 2001 From: Pieter Pas Date: Fri, 30 Apr 2021 00:43:49 +0200 Subject: [PATCH 4/5] Remove trailing whitespace and formatting iostream --- include/pybind11/iostream.h | 6 +++--- tests/test_iostream.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/pybind11/iostream.h b/include/pybind11/iostream.h index b3d6e78273..94d2ff2b73 100644 --- a/include/pybind11/iostream.h +++ b/include/pybind11/iostream.h @@ -41,7 +41,7 @@ class pythonbuf : public std::streambuf { return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof(); } - // Computes how many bytes at the end of the buffer are part of an + // Computes how many bytes at the end of the buffer are part of an // incomplete sequence of UTF-8 bytes. // Precondition: pbase() < pptr() size_t utf8_remainder() const { @@ -71,7 +71,7 @@ class pythonbuf : public std::streambuf { const auto dist = static_cast(leading - rpptr); size_t remainder = 0; - if (dist == 0) + if (dist == 0) remainder = 1; // 1-byte code point is impossible else if (dist == 1) remainder = is_leading_2b(*leading) ? 0 : dist + 1; @@ -79,7 +79,7 @@ class pythonbuf : public std::streambuf { remainder = is_leading_3b(*leading) ? 0 : dist + 1; // else if (dist >= 3), at least 4 bytes before encountering an UTF-8 // leading byte, either no remainder or invalid UTF-8. - // Invalid UTF-8 will cause an exception later when converting + // Invalid UTF-8 will cause an exception later when converting // to a Python string, so that's not handled here. return remainder; } diff --git a/tests/test_iostream.py b/tests/test_iostream.py index 213569a8d3..e2b74d01cb 100644 --- a/tests/test_iostream.py +++ b/tests/test_iostream.py @@ -68,6 +68,7 @@ def test_captured_large_string(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_2byte_offset0(capsys): msg = "\u07FF" msg = "" + msg * (1024 // len(msg) + 1) @@ -77,6 +78,7 @@ def test_captured_utf8_2byte_offset0(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_2byte_offset1(capsys): msg = "\u07FF" msg = "1" + msg * (1024 // len(msg) + 1) @@ -86,6 +88,7 @@ def test_captured_utf8_2byte_offset1(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_3byte_offset0(capsys): msg = "\uFFFF" msg = "" + msg * (1024 // len(msg) + 1) @@ -95,6 +98,7 @@ def test_captured_utf8_3byte_offset0(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_3byte_offset1(capsys): msg = "\uFFFF" msg = "1" + msg * (1024 // len(msg) + 1) @@ -104,6 +108,7 @@ def test_captured_utf8_3byte_offset1(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_3byte_offset2(capsys): msg = "\uFFFF" msg = "12" + msg * (1024 // len(msg) + 1) @@ -113,6 +118,7 @@ def test_captured_utf8_3byte_offset2(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_4byte_offset0(capsys): msg = "\U0010FFFF" msg = "" + msg * (1024 // len(msg) + 1) @@ -122,6 +128,7 @@ def test_captured_utf8_4byte_offset0(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_4byte_offset1(capsys): msg = "\U0010FFFF" msg = "1" + msg * (1024 // len(msg) + 1) @@ -131,6 +138,7 @@ def test_captured_utf8_4byte_offset1(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_4byte_offset2(capsys): msg = "\U0010FFFF" msg = "12" + msg * (1024 // len(msg) + 1) @@ -140,6 +148,7 @@ def test_captured_utf8_4byte_offset2(capsys): assert stdout == msg assert stderr == "" + def test_captured_utf8_4byte_offset3(capsys): msg = "\U0010FFFF" msg = "123" + msg * (1024 // len(msg) + 1) @@ -149,6 +158,7 @@ def test_captured_utf8_4byte_offset3(capsys): assert stdout == msg assert stderr == "" + def test_guard_capture(capsys): msg = "I've been redirected to Python, I hope!" m.guard_output(msg) From 6f88a5a98480e729f7b92547db45b73ec2f7a07c Mon Sep 17 00:00:00 2001 From: Pieter Pas Date: Sun, 2 May 2021 13:26:52 +0200 Subject: [PATCH 5/5] Avoid buffer overflow if ostream redirect races This doesn't solve the actual race, but at least it now has a much lower probability of reading past the end of the buffer even when data races do occur. --- include/pybind11/iostream.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/include/pybind11/iostream.h b/include/pybind11/iostream.h index 94d2ff2b73..7c1c718b02 100644 --- a/include/pybind11/iostream.h +++ b/include/pybind11/iostream.h @@ -88,11 +88,10 @@ class pythonbuf : public std::streambuf { // rare MSVC test failure shows up with this version, then this should be // simplified to a fully qualified call. int _sync() { - if (pbase() != pptr()) { - - { - gil_scoped_acquire tmp; - + if (pbase() != pptr()) { // If buffer is not empty + gil_scoped_acquire tmp; + // Placed inside gil_scoped_acquire as a mutex to avoid a race. + if (pbase() != pptr()) { // Check again under the lock // This subtraction cannot be negative, so dropping the sign. auto size = static_cast(pptr() - pbase()); size_t remainder = utf8_remainder(); @@ -103,15 +102,12 @@ class pythonbuf : public std::streambuf { pyflush(); } - // Placed inside gil_scoped_aquire as a mutex to avoid a race. - // Copy the remainder at the end of the buffer to the beginning: if (remainder > 0) std::memmove(pbase(), pptr() - remainder, remainder); setp(pbase(), epptr()); pbump(static_cast(remainder)); } - } return 0; }