Skip to content

Commit 97115bf

Browse files
committed
fix(download): respect Content-Length when Content-Encoding is present
Per RFC 9110 § 8.6, the `Content-Length` header reflects the **encoded** size, i.e. the number of bytes actually transferred. The previous logic compared it to the decoded size, yielding false "Incomplete download" errors for gzip responses.
1 parent 308a8de commit 97115bf

File tree

4 files changed

+210
-169
lines changed

4 files changed

+210
-169
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
This document records all notable changes to [HTTPie](https://httpie.io).
44
This project adheres to [Semantic Versioning](https://semver.org/).
55

6+
## Unreleased
7+
8+
### Fixed
9+
- Respect `Content-Length` with `--download` when `Content-Encoding` is present to avoid false "Incomplete download" errors. ([#423](https://github.com/httpie/cli/issues/423))
10+
611
## [3.2.4](https://github.com/httpie/cli/compare/3.2.3...3.2.4) (2024-11-01)
712

813
- Fix default certs loading and unpin `requests`. ([#1596](https://github.com/httpie/cli/issues/1596))

docs/download.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Download mode
2+
3+
HTTPie's `--download` option saves response bodies to files. When a server
4+
returns a `Content-Encoding` header (for example `gzip`), the `Content-Length` header
5+
is treated as the size of the encoded payload, as defined in RFC 9110 § 8.6.
6+
HTTPie writes the body exactly as received and no longer compares the header to
7+
the post-decompression size.
8+

httpie/downloads.py

Lines changed: 71 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Download mode implementation.
33
44
"""
5+
56
import mimetypes
67
import os
78
import re
@@ -12,10 +13,9 @@
1213

1314
import requests
1415

16+
from .context import Environment
1517
from .models import HTTPResponse, OutputOptions
1618
from .output.streams import RawStream
17-
from .context import Environment
18-
1919

2020
PARTIAL_CONTENT = 206
2121

@@ -37,24 +37,23 @@ def parse_content_range(content_range: str, resumed_from: int) -> int:
3737
3838
"""
3939
if content_range is None:
40-
raise ContentRangeError('Missing Content-Range')
40+
raise ContentRangeError("Missing Content-Range")
4141

4242
pattern = (
43-
r'^bytes (?P<first_byte_pos>\d+)-(?P<last_byte_pos>\d+)'
44-
r'/(\*|(?P<instance_length>\d+))$'
43+
r"^bytes (?P<first_byte_pos>\d+)-(?P<last_byte_pos>\d+)"
44+
r"/(\*|(?P<instance_length>\d+))$"
4545
)
4646
match = re.match(pattern, content_range)
4747

4848
if not match:
49-
raise ContentRangeError(
50-
f'Invalid Content-Range format {content_range!r}')
49+
raise ContentRangeError(f"Invalid Content-Range format {content_range!r}")
5150

5251
content_range_dict = match.groupdict()
53-
first_byte_pos = int(content_range_dict['first_byte_pos'])
54-
last_byte_pos = int(content_range_dict['last_byte_pos'])
52+
first_byte_pos = int(content_range_dict["first_byte_pos"])
53+
last_byte_pos = int(content_range_dict["last_byte_pos"])
5554
instance_length = (
56-
int(content_range_dict['instance_length'])
57-
if content_range_dict['instance_length']
55+
int(content_range_dict["instance_length"])
56+
if content_range_dict["instance_length"]
5857
else None
5958
)
6059

@@ -64,27 +63,24 @@ def parse_content_range(content_range: str, resumed_from: int) -> int:
6463
# last-byte-pos value, is invalid. The recipient of an invalid
6564
# byte-content-range-spec MUST ignore it and any content
6665
# transferred along with it."
67-
if (first_byte_pos > last_byte_pos
68-
or (instance_length is not None
69-
and instance_length <= last_byte_pos)):
70-
raise ContentRangeError(
71-
f'Invalid Content-Range returned: {content_range!r}')
66+
if first_byte_pos > last_byte_pos or (
67+
instance_length is not None and instance_length <= last_byte_pos
68+
):
69+
raise ContentRangeError(f"Invalid Content-Range returned: {content_range!r}")
7270

73-
if (first_byte_pos != resumed_from
74-
or (instance_length is not None
75-
and last_byte_pos + 1 != instance_length)):
71+
if first_byte_pos != resumed_from or (
72+
instance_length is not None and last_byte_pos + 1 != instance_length
73+
):
7674
# Not what we asked for.
7775
raise ContentRangeError(
78-
f'Unexpected Content-Range returned ({content_range!r})'
76+
f"Unexpected Content-Range returned ({content_range!r})"
7977
f' for the requested Range ("bytes={resumed_from}-")'
8078
)
8179

8280
return last_byte_pos + 1
8381

8482

85-
def filename_from_content_disposition(
86-
content_disposition: str
87-
) -> Optional[str]:
83+
def filename_from_content_disposition(content_disposition: str) -> Optional[str]:
8884
"""
8985
Extract and validate filename from a Content-Disposition header.
9086
@@ -94,28 +90,28 @@ def filename_from_content_disposition(
9490
"""
9591
# attachment; filename=jakubroztocil-httpie-0.4.1-20-g40bd8f6.tar.gz
9692

97-
msg = Message(f'Content-Disposition: {content_disposition}')
93+
msg = Message(f"Content-Disposition: {content_disposition}")
9894
filename = msg.get_filename()
9995
if filename:
10096
# Basic sanitation.
101-
filename = os.path.basename(filename).lstrip('.').strip()
97+
filename = os.path.basename(filename).lstrip(".").strip()
10298
if filename:
10399
return filename
104100

105101

106102
def filename_from_url(url: str, content_type: Optional[str]) -> str:
107-
fn = urlsplit(url).path.rstrip('/')
108-
fn = os.path.basename(fn) if fn else 'index'
109-
if '.' not in fn and content_type:
110-
content_type = content_type.split(';')[0]
111-
if content_type == 'text/plain':
103+
fn = urlsplit(url).path.rstrip("/")
104+
fn = os.path.basename(fn) if fn else "index"
105+
if "." not in fn and content_type:
106+
content_type = content_type.split(";")[0]
107+
if content_type == "text/plain":
112108
# mimetypes returns '.ksh'
113-
ext = '.txt'
109+
ext = ".txt"
114110
else:
115111
ext = mimetypes.guess_extension(content_type)
116112

117-
if ext == '.htm':
118-
ext = '.html'
113+
if ext == ".htm":
114+
ext = ".html"
119115

120116
if ext:
121117
fn += ext
@@ -136,12 +132,12 @@ def trim_filename(filename: str, max_len: int) -> str:
136132

137133
def get_filename_max_length(directory: str) -> int:
138134
max_len = 255
139-
if hasattr(os, 'pathconf') and 'PC_NAME_MAX' in os.pathconf_names:
140-
max_len = os.pathconf(directory, 'PC_NAME_MAX')
135+
if hasattr(os, "pathconf") and "PC_NAME_MAX" in os.pathconf_names:
136+
max_len = os.pathconf(directory, "PC_NAME_MAX")
141137
return max_len
142138

143139

144-
def trim_filename_if_needed(filename: str, directory='.', extra=0) -> str:
140+
def trim_filename_if_needed(filename: str, directory=".", extra=0) -> str:
145141
max_len = get_filename_max_length(directory) - extra
146142
if len(filename) > max_len:
147143
filename = trim_filename(filename, max_len)
@@ -151,7 +147,7 @@ def trim_filename_if_needed(filename: str, directory='.', extra=0) -> str:
151147
def get_unique_filename(filename: str, exists=os.path.exists) -> str:
152148
attempt = 0
153149
while True:
154-
suffix = f'-{attempt}' if attempt > 0 else ''
150+
suffix = f"-{attempt}" if attempt > 0 else ""
155151
try_filename = trim_filename_if_needed(filename, extra=len(suffix))
156152
try_filename += suffix
157153
if not exists(try_filename):
@@ -161,12 +157,7 @@ def get_unique_filename(filename: str, exists=os.path.exists) -> str:
161157

162158
class Downloader:
163159

164-
def __init__(
165-
self,
166-
env: Environment,
167-
output_file: IO = None,
168-
resume: bool = False
169-
):
160+
def __init__(self, env: Environment, output_file: IO = None, resume: bool = False):
170161
"""
171162
:param resume: Should the download resume if partial download
172163
already exists.
@@ -190,19 +181,17 @@ def pre_request(self, request_headers: dict):
190181
191182
"""
192183
# Ask the server not to encode the content so that we can resume, etc.
193-
request_headers['Accept-Encoding'] = 'identity'
184+
request_headers["Accept-Encoding"] = "identity"
194185
if self._resume:
195186
bytes_have = os.path.getsize(self._output_file.name)
196187
if bytes_have:
197188
# Set ``Range`` header to resume the download
198189
# TODO: Use "If-Range: mtime" to make sure it's fresh?
199-
request_headers['Range'] = f'bytes={bytes_have}-'
190+
request_headers["Range"] = f"bytes={bytes_have}-"
200191
self._resumed_from = bytes_have
201192

202193
def start(
203-
self,
204-
initial_url: str,
205-
final_response: requests.Response
194+
self, initial_url: str, final_response: requests.Response
206195
) -> Tuple[RawStream, IO]:
207196
"""
208197
Initiate and return a stream for `response` body with progress
@@ -216,13 +205,27 @@ def start(
216205
"""
217206
assert not self.status.time_started
218207

219-
# FIXME: some servers still might sent Content-Encoding: gzip
220-
# <https://github.com/httpie/cli/issues/423>
208+
# Some servers may still send a compressed body even though
209+
# we ask for identity encoding. In that case, ``Content-Length``
210+
# refers to the encoded size (RFC 9110 § 8.6), so we disable
211+
# automatic decoding to make our byte tracking match.
221212
try:
222-
total_size = int(final_response.headers['Content-Length'])
213+
total_size = int(final_response.headers["Content-Length"])
223214
except (KeyError, ValueError, TypeError):
224215
total_size = None
225216

217+
content_encoding = final_response.headers.get("Content-Encoding")
218+
if content_encoding:
219+
final_response.raw.decode_content = False
220+
221+
class EncodedHTTPResponse(HTTPResponse):
222+
def iter_body(self, chunk_size=1): # type: ignore[override]
223+
return final_response.raw.stream(chunk_size, decode_content=False)
224+
225+
response_msg = EncodedHTTPResponse(final_response)
226+
else:
227+
response_msg = HTTPResponse(final_response)
228+
226229
if not self._output_file:
227230
self._output_file = self._get_output_file_from_response(
228231
initial_url=initial_url,
@@ -232,8 +235,7 @@ def start(
232235
# `--output, -o` provided
233236
if self._resume and final_response.status_code == PARTIAL_CONTENT:
234237
total_size = parse_content_range(
235-
final_response.headers.get('Content-Range'),
236-
self._resumed_from
238+
final_response.headers.get("Content-Range"), self._resumed_from
237239
)
238240

239241
else:
@@ -244,17 +246,19 @@ def start(
244246
except OSError:
245247
pass # stdout
246248

247-
output_options = OutputOptions.from_message(final_response, headers=False, body=True)
249+
output_options = OutputOptions.from_message(
250+
final_response, headers=False, body=True
251+
)
248252
stream = RawStream(
249-
msg=HTTPResponse(final_response),
253+
msg=response_msg,
250254
output_options=output_options,
251255
on_body_chunk_downloaded=self.chunk_downloaded,
252256
)
253257

254258
self.status.started(
255259
output_file=self._output_file,
256260
resumed_from=self._resumed_from,
257-
total_size=total_size
261+
total_size=total_size,
258262
)
259263

260264
return stream, self._output_file
@@ -292,16 +296,17 @@ def _get_output_file_from_response(
292296
) -> IO:
293297
# Output file not specified. Pick a name that doesn't exist yet.
294298
filename = None
295-
if 'Content-Disposition' in final_response.headers:
299+
if "Content-Disposition" in final_response.headers:
296300
filename = filename_from_content_disposition(
297-
final_response.headers['Content-Disposition'])
301+
final_response.headers["Content-Disposition"]
302+
)
298303
if not filename:
299304
filename = filename_from_url(
300305
url=initial_url,
301-
content_type=final_response.headers.get('Content-Type'),
306+
content_type=final_response.headers.get("Content-Type"),
302307
)
303308
unique_filename = get_unique_filename(filename)
304-
return open(unique_filename, buffering=0, mode='a+b')
309+
return open(unique_filename, buffering=0, mode="a+b")
305310

306311

307312
class DownloadStatus:
@@ -325,11 +330,11 @@ def started(self, output_file, resumed_from=0, total_size=None):
325330
def start_display(self, output_file):
326331
from httpie.output.ui.rich_progress import (
327332
DummyDisplay,
333+
ProgressDisplay,
328334
StatusDisplay,
329-
ProgressDisplay
330335
)
331336

332-
message = f'Downloading to {output_file.name}'
337+
message = f"Downloading to {output_file.name}"
333338
if self.env.show_displays:
334339
if self.total_size is None:
335340
# Rich does not support progress bars without a total
@@ -341,9 +346,7 @@ def start_display(self, output_file):
341346
self.display = DummyDisplay(self.env)
342347

343348
self.display.start(
344-
total=self.total_size,
345-
at=self.downloaded,
346-
description=message
349+
total=self.total_size, at=self.downloaded, description=message
347350
)
348351

349352
def chunk_downloaded(self, size):
@@ -357,10 +360,7 @@ def has_finished(self):
357360

358361
@property
359362
def time_spent(self):
360-
if (
361-
self.time_started is not None
362-
and self.time_finished is not None
363-
):
363+
if self.time_started is not None and self.time_finished is not None:
364364
return self.time_finished - self.time_started
365365
else:
366366
return None
@@ -369,9 +369,9 @@ def finished(self):
369369
assert self.time_started is not None
370370
assert self.time_finished is None
371371
self.time_finished = monotonic()
372-
if hasattr(self, 'display'):
372+
if hasattr(self, "display"):
373373
self.display.stop(self.time_spent)
374374

375375
def terminate(self):
376-
if hasattr(self, 'display'):
376+
if hasattr(self, "display"):
377377
self.display.stop(self.time_spent)

0 commit comments

Comments
 (0)