Skip to content

Commit ad8aead

Browse files
kientzle authored and mmatuska committed
Issue 2548: Reading GNU sparse entries (libarchive#2558)
My attempt to fix libarchive#2404 just made the confusion between the size of the extracted file and the size of the contents in the tar archive worse than it was before.

@ferivoz in libarchive#2557 showed that the confusion stemmed from a point where we were setting the size in the entry (which is by definition the size of the file on disk) when we read the `GNU.sparse.size` and `GNU.sparse.realsize` attributes (which might represent the size on disk or in the archive) and then using that to determine whether to read the value in the ustar header (which represents the size of the data in the archive).

The confusion stems from three issues:

* The GNU.sparse.* fields mean different things depending on the version of GNU tar used.
* The regular Pax `size` field overrides the value in the ustar header, but the GNU sparse size fields don't always do so.
* The previous libarchive code tried to reconcile different size information as we went along, which is problematic because the order in which this information appears can vary.

This PR makes one big structural change: We now have separate storage for every different size field we might encounter. We now just store these values and record which one we saw. Then at the end, when we have all the information available at once, we can use this data to determine the size on disk and the size in the archive.

A few key facts about GNU sparse formats:

* GNU legacy sparse format: Stored all the relevant info in an extension of the ustar header.
* GNU pax 0.0 format: Used `GNU.sparse.size` to store the size on disk
* GNU pax 0.1 format: Used `GNU.sparse.size` to store the size on disk
* GNU pax 1.0 format: Used `GNU.sparse.realsize` to store the size on disk; repurposed `GNU.sparse.size` to store the size in the archive, but omitted this in favor of the ustar size field when that could be used.

And of course, some key precedence information:

* Pax `size` field always overrides the ustar header size field.
* GNU sparse size fields override it ONLY when they represent the size of the data in the archive.

Resolves libarchive#2548

(cherry picked from commit 29fd918)
1 parent 16e87c1 commit ad8aead

6 files changed

+176
-71
lines changed

Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,7 @@ libarchive_test_SOURCES= \
490490
libarchive/test/test_read_format_gtar_gz.c \
491491
libarchive/test/test_read_format_gtar_lzma.c \
492492
libarchive/test/test_read_format_gtar_sparse.c \
493+
libarchive/test/test_read_format_gtar_sparse_length.c \
493494
libarchive/test/test_read_format_gtar_sparse_skip_entry.c \
494495
libarchive/test/test_read_format_huge_rpm.c \
495496
libarchive/test/test_read_format_iso_Z.c \
@@ -841,6 +842,7 @@ libarchive_test_EXTRA_DIST=\
841842
libarchive/test/test_read_format_gtar_sparse_1_17_posix01.tar.uu \
842843
libarchive/test/test_read_format_gtar_sparse_1_17_posix10.tar.uu \
843844
libarchive/test/test_read_format_gtar_sparse_1_17_posix10_modified.tar.uu \
845+
libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu \
844846
libarchive/test/test_read_format_gtar_sparse_skip_entry.tar.Z.uu \
845847
libarchive/test/test_read_format_huge_rpm.rpm.uu \
846848
libarchive/test/test_read_format_iso.iso.Z.uu \

libarchive/archive_read_support_format_tar.c

Lines changed: 99 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,11 @@ struct tar {
129129
int64_t entry_offset;
130130
int64_t entry_padding;
131131
int64_t entry_bytes_unconsumed;
132-
int64_t realsize;
132+
int64_t disk_size;
133+
int64_t GNU_sparse_realsize;
134+
int64_t GNU_sparse_size;
135+
int64_t SCHILY_sparse_realsize;
136+
int64_t pax_size;
133137
struct sparse_block *sparse_list;
134138
struct sparse_block *sparse_last;
135139
int64_t sparse_offset;
@@ -138,6 +142,7 @@ struct tar {
138142
int sparse_gnu_minor;
139143
char sparse_gnu_attributes_seen;
140144
char filetype;
145+
char size_fields; /* Bits defined below */
141146

142147
struct archive_string localname;
143148
struct archive_string_conv *opt_sconv;
@@ -148,9 +153,15 @@ struct tar {
148153
int compat_2x;
149154
int process_mac_extensions;
150155
int read_concatenated_archives;
151-
int realsize_override;
152156
};
153157

158+
/* Track which size fields were present in the headers */
159+
#define TAR_SIZE_PAX_SIZE 1
160+
#define TAR_SIZE_GNU_SPARSE_REALSIZE 2
161+
#define TAR_SIZE_GNU_SPARSE_SIZE 4
162+
#define TAR_SIZE_SCHILY_SPARSE_REALSIZE 8
163+
164+
154165
static int archive_block_is_null(const char *p);
155166
static char *base64_decode(const char *, size_t, size_t *);
156167
static int gnu_add_sparse_entry(struct archive_read *, struct tar *,
@@ -529,8 +540,7 @@ archive_read_format_tar_read_header(struct archive_read *a,
529540
tar = (struct tar *)(a->format->data);
530541
tar->entry_offset = 0;
531542
gnu_clear_sparse_list(tar);
532-
tar->realsize = -1; /* Mark this as "unset" */
533-
tar->realsize_override = 0;
543+
tar->size_fields = 0; /* We don't have any size info yet */
534544

535545
/* Setup default string conversion. */
536546
tar->sconv = tar->opt_sconv;
@@ -622,7 +632,7 @@ archive_read_format_tar_read_data(struct archive_read *a,
622632
tar->entry_padding = 0;
623633
*buff = NULL;
624634
*size = 0;
625-
*offset = tar->realsize;
635+
*offset = tar->disk_size;
626636
return (ARCHIVE_EOF);
627637
}
628638

@@ -1290,6 +1300,11 @@ read_body_to_string(struct archive_read *a, struct tar *tar,
12901300
* allows header_old_tar and header_ustar
12911301
* to handle filenames differently, while still putting most of the
12921302
* common parsing into one place.
1303+
*
1304+
* This is called _after_ ustar, GNU tar, Schily, etc, special
1305+
* fields have already been parsed into the `tar` structure.
1306+
* So we can make final decisions here about how to reconcile
1307+
* size, mode, etc, information.
12931308
*/
12941309
static int
12951310
header_common(struct archive_read *a, struct tar *tar,
@@ -1323,28 +1338,60 @@ header_common(struct archive_read *a, struct tar *tar,
13231338
archive_entry_set_mtime(entry, tar_atol(header->mtime, sizeof(header->mtime)), 0);
13241339
}
13251340

1326-
/* Update size information as appropriate */
1327-
if (!archive_entry_size_is_set(entry)) {
1328-
tar->entry_bytes_remaining = tar_atol(header->size, sizeof(header->size));
1329-
if (tar->entry_bytes_remaining < 0) {
1330-
tar->entry_bytes_remaining = 0;
1331-
archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
1332-
"Tar entry has negative size");
1333-
return (ARCHIVE_FATAL);
1334-
}
1335-
if (tar->entry_bytes_remaining > entry_limit) {
1336-
tar->entry_bytes_remaining = 0;
1337-
archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
1338-
"Tar entry size overflow");
1339-
return (ARCHIVE_FATAL);
1340-
}
1341-
if (!tar->realsize_override) {
1342-
tar->realsize = tar->entry_bytes_remaining;
1343-
}
1344-
archive_entry_set_size(entry, tar->realsize);
1345-
} else if (tar->realsize_override) {
1346-
tar->entry_bytes_remaining = tar->realsize;
1347-
archive_entry_set_size(entry, tar->realsize);
1341+
/* Reconcile the size info. */
1342+
/* First, how big is the file on disk? */
1343+
if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_REALSIZE) != 0) {
1344+
/* GNU sparse format 1.0 uses `GNU.sparse.realsize`
1345+
* to hold the size of the file on disk. */
1346+
tar->disk_size = tar->GNU_sparse_realsize;
1347+
} else if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_SIZE) != 0
1348+
&& (tar->sparse_gnu_major == 0)) {
1349+
/* GNU sparse format 0.0 and 0.1 use `GNU.sparse.size`
1350+
* to hold the size of the file on disk. */
1351+
tar->disk_size = tar->GNU_sparse_size;
1352+
} else if ((tar->size_fields & TAR_SIZE_SCHILY_SPARSE_REALSIZE) != 0) {
1353+
tar->disk_size = tar->SCHILY_sparse_realsize;
1354+
} else if ((tar->size_fields & TAR_SIZE_PAX_SIZE) != 0) {
1355+
tar->disk_size = tar->pax_size;
1356+
} else {
1357+
/* There wasn't a suitable pax header, so use the ustar info */
1358+
tar->disk_size = tar_atol(header->size, sizeof(header->size));
1359+
}
1360+
1361+
if (tar->disk_size < 0) {
1362+
archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
1363+
"Tar entry has negative file size");
1364+
return (ARCHIVE_FATAL);
1365+
} else if (tar->disk_size > entry_limit) {
1366+
archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
1367+
"Tar entry size overflow");
1368+
return (ARCHIVE_FATAL);
1369+
} else {
1370+
archive_entry_set_size(entry, tar->disk_size);
1371+
}
1372+
1373+
/* Second, how big is the data in the archive? */
1374+
if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_SIZE) != 0
1375+
&& (tar->sparse_gnu_major == 1)) {
1376+
/* GNU sparse format 1.0 uses `GNU.sparse.size`
1377+
* to hold the size of the data in the archive. */
1378+
tar->entry_bytes_remaining = tar->GNU_sparse_size;
1379+
} else if ((tar->size_fields & TAR_SIZE_PAX_SIZE) != 0) {
1380+
tar->entry_bytes_remaining = tar->pax_size;
1381+
} else {
1382+
tar->entry_bytes_remaining
1383+
= tar_atol(header->size, sizeof(header->size));
1384+
}
1385+
if (tar->entry_bytes_remaining < 0) {
1386+
tar->entry_bytes_remaining = 0;
1387+
archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
1388+
"Tar entry has negative size");
1389+
return (ARCHIVE_FATAL);
1390+
} else if (tar->entry_bytes_remaining > entry_limit) {
1391+
tar->entry_bytes_remaining = 0;
1392+
archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
1393+
"Tar entry size overflow");
1394+
return (ARCHIVE_FATAL);
13481395
}
13491396

13501397
/* Handle the tar type flag appropriately. */
@@ -2299,10 +2346,13 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
22992346
}
23002347
else if (key_length == 4 && memcmp(key, "size", 4) == 0) {
23012348
/* GNU.sparse.size */
2349+
/* This is either the size of stored entry OR the size of data on disk,
2350+
* depending on which GNU sparse format version is in use.
2351+
* Since pax attributes can be in any order, we may not actually
2352+
* know at this point how to interpret this. */
23022353
if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
2303-
tar->realsize = t;
2304-
archive_entry_set_size(entry, tar->realsize);
2305-
tar->realsize_override = 1;
2354+
tar->GNU_sparse_size = t;
2355+
tar->size_fields |= TAR_SIZE_GNU_SPARSE_SIZE;
23062356
}
23072357
return (err);
23082358
}
@@ -2370,11 +2420,10 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
23702420
return (err);
23712421
}
23722422
else if (key_length == 8 && memcmp(key, "realsize", 8) == 0) {
2373-
/* GNU.sparse.realsize */
2423+
/* GNU.sparse.realsize = size of file on disk */
23742424
if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
2375-
tar->realsize = t;
2376-
archive_entry_set_size(entry, tar->realsize);
2377-
tar->realsize_override = 1;
2425+
tar->GNU_sparse_realsize = t;
2426+
tar->size_fields |= TAR_SIZE_GNU_SPARSE_REALSIZE;
23782427
}
23792428
return (err);
23802429
}
@@ -2555,12 +2604,12 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
25552604
}
25562605
else if (key_length == 8 && memcmp(key, "realsize", 8) == 0) {
25572606
if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
2558-
tar->realsize = t;
2559-
tar->realsize_override = 1;
2560-
archive_entry_set_size(entry, tar->realsize);
2607+
tar->SCHILY_sparse_realsize = t;
2608+
tar->size_fields |= TAR_SIZE_SCHILY_SPARSE_REALSIZE;
25612609
}
25622610
return (err);
25632611
}
2612+
/* TODO: Is there a SCHILY.sparse.size similar to GNU.sparse.size ? */
25642613
else if (key_length > 6 && memcmp(key, "xattr.", 6) == 0) {
25652614
key_length -= 6;
25662615
key += 6;
@@ -2727,19 +2776,8 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
27272776
if (key_length == 4 && memcmp(key, "size", 4) == 0) {
27282777
/* "size" is the size of the data in the entry. */
27292778
if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
2730-
tar->entry_bytes_remaining = t;
2731-
/*
2732-
* The "size" pax header keyword always overrides the
2733-
* "size" field in the tar header.
2734-
* GNU.sparse.realsize, GNU.sparse.size and
2735-
* SCHILY.realsize override this value.
2736-
*/
2737-
if (!tar->realsize_override) {
2738-
archive_entry_set_size(entry,
2739-
tar->entry_bytes_remaining);
2740-
tar->realsize
2741-
= tar->entry_bytes_remaining;
2742-
}
2779+
tar->pax_size = t;
2780+
tar->size_fields |= TAR_SIZE_PAX_SIZE;
27432781
}
27442782
else if (t == INT64_MAX) {
27452783
/* Note: pax_attr_read_number returns INT64_MAX on overflow or < 0 */
@@ -2851,11 +2889,6 @@ header_gnutar(struct archive_read *a, struct tar *tar,
28512889
* filename is stored as in old-style archives.
28522890
*/
28532891

2854-
/* Grab fields common to all tar variants. */
2855-
err = header_common(a, tar, entry, h);
2856-
if (err == ARCHIVE_FATAL)
2857-
return (err);
2858-
28592892
/* Copy filename over (to ensure null termination). */
28602893
header = (const struct archive_entry_header_gnutar *)h;
28612894
const char *existing_pathname = archive_entry_pathname(entry);
@@ -2904,8 +2937,6 @@ header_gnutar(struct archive_read *a, struct tar *tar,
29042937
archive_entry_set_rdev(entry, 0);
29052938
}
29062939

2907-
tar->entry_padding = 0x1ff & (-tar->entry_bytes_remaining);
2908-
29092940
/* Grab GNU-specific fields. */
29102941
if (!archive_entry_atime_is_set(entry)) {
29112942
t = tar_atol(header->atime, sizeof(header->atime));
@@ -2919,10 +2950,10 @@ header_gnutar(struct archive_read *a, struct tar *tar,
29192950
}
29202951

29212952
if (header->realsize[0] != 0) {
2922-
tar->realsize
2953+
/* Treat as a synonym for the pax GNU.sparse.realsize attr */
2954+
tar->GNU_sparse_realsize
29232955
= tar_atol(header->realsize, sizeof(header->realsize));
2924-
archive_entry_set_size(entry, tar->realsize);
2925-
tar->realsize_override = 1;
2956+
tar->size_fields |= TAR_SIZE_GNU_SPARSE_REALSIZE;
29262957
}
29272958

29282959
if (header->sparse[0].offset[0] != 0) {
@@ -2935,6 +2966,13 @@ header_gnutar(struct archive_read *a, struct tar *tar,
29352966
}
29362967
}
29372968

2969+
/* Grab fields common to all tar variants. */
2970+
err = header_common(a, tar, entry, h);
2971+
if (err == ARCHIVE_FATAL)
2972+
return (err);
2973+
2974+
tar->entry_padding = 0x1ff & (-tar->entry_bytes_remaining);
2975+
29382976
return (err);
29392977
}
29402978

@@ -3114,8 +3152,7 @@ gnu_sparse_01_parse(struct archive_read *a, struct tar *tar, const char *p, size
31143152
* it's not possible to support both variants. This code supports
31153153
* the later variant at the expense of not supporting the former.
31163154
*
3117-
* This variant also replaced GNU.sparse.size with GNU.sparse.realsize
3118-
* and introduced the GNU.sparse.major/GNU.sparse.minor attributes.
3155+
* This variant also introduced the GNU.sparse.major/GNU.sparse.minor attributes.
31193156
*/
31203157

31213158
/*

libarchive/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ IF(ENABLE_TEST)
134134
test_read_format_gtar_gz.c
135135
test_read_format_gtar_lzma.c
136136
test_read_format_gtar_sparse.c
137+
test_read_format_gtar_sparse_length.c
137138
test_read_format_gtar_sparse_skip_entry.c
138139
test_read_format_huge_rpm.c
139140
test_read_format_iso_Z.c
libarchive/test/test_read_format_gtar_sparse_length.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*-
2+
* Copyright (c) 2003-2025 Tim Kientzle
3+
* All rights reserved.
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions
7+
* are met:
8+
* 1. Redistributions of source code must retain the above copyright
9+
* notice, this list of conditions and the following disclaimer.
10+
* 2. Redistributions in binary form must reproduce the above copyright
11+
* notice, this list of conditions and the following disclaimer in the
12+
* documentation and/or other materials provided with the distribution.
13+
*
14+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15+
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16+
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17+
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18+
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19+
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23+
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24+
*/
25+
#include "test.h"
26+
27+
28+
DEFINE_TEST(test_read_format_gtar_sparse_length)
29+
{
30+
const char *refname = "test_read_format_gtar_sparse_length.tar.Z";
31+
int err;
32+
struct archive *a;
33+
struct archive_entry *ae;
34+
35+
extract_reference_file(refname);
36+
37+
assert((a = archive_read_new()) != NULL);
38+
assert(0 == archive_read_support_filter_all(a));
39+
assert(0 == archive_read_support_format_tar(a));
40+
failure("Can't open %s", refname);
41+
assert(0 == archive_read_open_filename(a, refname, 3));
42+
43+
err = archive_read_next_header(a, &ae);
44+
assertEqualIntA(a, ARCHIVE_OK, err);
45+
err = archive_read_next_header(a, &ae);
46+
assertEqualIntA(a, ARCHIVE_OK, err);
47+
48+
err = archive_read_next_header(a, &ae);
49+
assertEqualIntA(a, ARCHIVE_EOF, err);
50+
51+
assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
52+
assertEqualInt(ARCHIVE_OK, archive_read_free(a));
53+
}
libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
begin 644 test_read_format_gtar_sparse_length.tar.Z
2+
M'YV09-+(>0$E#!XD9<*0*3/03!HV96(`F$BQHL6+&#-JW,BQ(T48(&W0H`$"
3+
M`,@8-VK$*'DRY4J3(&.>C`$#1`P:-V#,J#$CAHP8-D#``%IC)``0>#PJ7<JT
4+
MJ5.,=>;0"2,')APV=<Z(J?.0S-.*5[-N[?KUH\R:,&66+,NVK=NW<.'*D`'B
5+
MB),J+N;`H3JGC(LV8=2\D=,CAH*Y=>_FW2NG[]\T;@;W@'$81V*\>OGZE9.0
6+
MS9PT>LH4EG$CAX*XJ%.K7LVZM>O7L&/+GLTQX$"[5:8P[FODH5\8+QQ"E$C;
7+
M=4R1)&&B5,ERJ,OF9VF"M(E3)T^?0(6BG#'CQE$8Q<,WC3JU*LBP6KFR\<H6
8+
M_=CU;<^BE;]6O/W[KF4HH$S99^G]I^$GX(`$%FC@@0@F.*!M,KR@(&TQI51#
9+
M<\N]U!)S:<DWW4TY[=333T$-U1T-,AQ5PX/VD4>555BE1U9[+;['7EGT93@=
10+
.BCCFJ...//;HXX]`_@@`
11+
`
12+
end

0 commit comments

Comments
 (0)