From b4e382a8a174e02b526453e4d26e31d1eef29e6e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 5 Oct 2021 12:08:59 +0200 Subject: [PATCH 1/4] Buffer write calls to _compression.BaseStream objects This makes writing in small units such as lines or FASTQ records much faster --- src/xopen/__init__.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index bf17935..181a524 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -29,6 +29,7 @@ import subprocess import tempfile import time +import _compression from abc import ABC, abstractmethod from subprocess import Popen, PIPE, DEVNULL from typing import Optional, TextIO, AnyStr, IO, List, Set @@ -732,10 +733,19 @@ def xopen( detected_format = _detect_format_from_content(filename) if detected_format == "gz": - return _open_gz(filename, mode, compresslevel, threads) + opened_file = _open_gz(filename, mode, compresslevel, threads) elif detected_format == "xz": - return _open_xz(filename, mode) + opened_file = _open_xz(filename, mode) elif detected_format == "bz2": - return _open_bz2(filename, mode, threads) + opened_file = _open_bz2(filename, mode, threads) else: - return open(filename, mode) + opened_file = open(filename, mode) + + # The "write" method for GzipFile is very costly. Lots of python calls are + # made. To a lesser extent this is true for LzmaFile and BZ2File. By + # putting a buffer in between, the expensive write method is called much + # less. The effect is very noticable when writing small units such as lines + # or FASTQ records. + if isinstance(opened_file, _compression.BaseStream) and "w" in mode: + opened_file = io.BufferedWriter(opened_file) + return opened_file From 5ede0f9c9566e7a6b23280d802cf1f81f5c70e05 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 5 Oct 2021 12:18:15 +0200 Subject: [PATCH 2/4] Fix tests and linting --- src/xopen/__init__.py | 2 +- tests/test_xopen.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 181a524..b228066 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -747,5 +747,5 @@ def xopen( # less. The effect is very noticable when writing small units such as lines # or FASTQ records. if isinstance(opened_file, _compression.BaseStream) and "w" in mode: - opened_file = io.BufferedWriter(opened_file) + opened_file = io.BufferedWriter(opened_file) # type: ignore return opened_file diff --git a/tests/test_xopen.py b/tests/test_xopen.py index d9da91a..8055199 100644 --- a/tests/test_xopen.py +++ b/tests/test_xopen.py @@ -532,14 +532,16 @@ def test_write_no_threads(tmpdir, ext): klass = klasses[ext] path = str(tmpdir.join(f"out.{ext}")) with xopen(path, "wb", threads=0) as f: - assert isinstance(f, klass), f + assert isinstance(f, io.BufferedWriter) + if ext: + assert isinstance(f.raw, klass), f def test_write_gzip_no_threads_no_isal(tmpdir, xopen_without_igzip): import gzip path = str(tmpdir.join("out.gz")) with xopen_without_igzip(path, "wb", threads=0) as f: - assert isinstance(f, gzip.GzipFile), f + assert isinstance(f.raw, gzip.GzipFile), f def test_write_stdout(): From 3811b1578f2b5b99b5200a72886c77684f11b9b4 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 11 Oct 2021 09:20:26 +0200 Subject: [PATCH 3/4] Make compression check code more explicit --- src/xopen/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index b228066..270a9b2 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -29,7 +29,6 @@ import subprocess import tempfile import time -import _compression from abc import ABC, abstractmethod from subprocess import Popen, PIPE, DEVNULL from typing import Optional, TextIO, AnyStr, IO, List, Set @@ -744,8 +743,10 @@ def xopen( # The "write" method for GzipFile is very costly. Lots of python calls are # made. To a lesser extent this is true for LzmaFile and BZ2File. By # putting a buffer in between, the expensive write method is called much - # less. The effect is very noticable when writing small units such as lines - # or FASTQ records. - if isinstance(opened_file, _compression.BaseStream) and "w" in mode: + # less. The effect is very noticeable when writing small units such as + # lines or FASTQ records. + if (isinstance(opened_file, gzip.GzipFile) or + isinstance(opened_file, bz2.BZ2File) or + isinstance(opened_file, lzma.LZMAFile)) and "w" in mode: opened_file = io.BufferedWriter(opened_file) # type: ignore return opened_file From ad773ca99d08c6c8db0bb44fa2e8f4dd3f58d28e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 11 Oct 2021 11:35:07 +0200 Subject: [PATCH 4/4] Use tupled isinstance --- src/xopen/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 270a9b2..ef30bcb 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -745,8 +745,7 @@ def xopen( # putting a buffer in between, the expensive write method is called much # less. The effect is very noticeable when writing small units such as # lines or FASTQ records. - if (isinstance(opened_file, gzip.GzipFile) or - isinstance(opened_file, bz2.BZ2File) or - isinstance(opened_file, lzma.LZMAFile)) and "w" in mode: + if (isinstance(opened_file, (gzip.GzipFile, bz2.BZ2File, lzma.LZMAFile)) + and "w" in mode): opened_file = io.BufferedWriter(opened_file) # type: ignore return opened_file