Skip to content

Commit a902fcb

Browse files
authored
Merge pull request #83 from nlnwa/feat/warc-date-precision
feat: Write WARC-Date with up to nanosecond precision
2 parents 2cff962 + 31e9046 commit a902fcb

File tree

11 files changed

+103
-1734
lines changed

11 files changed

+103
-1734
lines changed

go.mod

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,23 @@
11
module github.com/nlnwa/gowarc/v2
22

3-
go 1.22.0
3+
go 1.23.0
44

5-
toolchain go1.22.7
5+
toolchain go1.23.7
66

77
require (
88
github.com/google/uuid v1.6.0
9-
github.com/klauspost/compress v1.17.11
10-
github.com/nlnwa/whatwg-url v0.5.0
11-
github.com/prometheus/prometheus v0.55.0
12-
github.com/stretchr/testify v1.9.0
9+
github.com/klauspost/compress v1.18.0
10+
github.com/nlnwa/whatwg-url v0.6.2
11+
github.com/prometheus/prometheus v0.302.1
12+
github.com/stretchr/testify v1.10.0
1313
)
1414

1515
require (
16-
github.com/bits-and-blooms/bitset v1.14.3 // indirect
16+
github.com/bits-and-blooms/bitset v1.22.0 // indirect
1717
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
18-
github.com/kr/pretty v0.3.1 // indirect
19-
github.com/pkg/errors v0.9.1 // indirect
2018
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
21-
golang.org/x/net v0.30.0 // indirect
22-
golang.org/x/sys v0.26.0 // indirect
23-
golang.org/x/text v0.19.0 // indirect
19+
golang.org/x/net v0.38.0 // indirect
20+
golang.org/x/sys v0.31.0 // indirect
21+
golang.org/x/text v0.23.0 // indirect
2422
gopkg.in/yaml.v3 v3.0.1 // indirect
2523
)

go.sum

Lines changed: 65 additions & 1683 deletions
Large diffs are not rendered by default.

httpblock.go

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ type HttpResponseBlock interface {
4242
HttpHeader() *http.Header
4343
}
4444

45-
var missingEndOfHeaders = errors.New("missing line separator at end of http headers")
45+
var errMissingEndOfHeaders = errors.New("missing line separator at end of http headers")
4646

4747
type httpRequestBlock struct {
4848
opts *warcRecordOptions
@@ -335,7 +335,7 @@ func headerBytes(r buffer) ([]byte, error) {
335335
}
336336
var err error
337337
if !sepFound {
338-
err = missingEndOfHeaders
338+
err = errMissingEndOfHeaders
339339
}
340340
return result.Bytes(), err
341341
}
@@ -369,7 +369,7 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig
369369
}
370370
}
371371

372-
if herr == missingEndOfHeaders && opts.fixSyntaxErrors {
372+
if herr == errMissingEndOfHeaders && opts.fixSyntaxErrors {
373373
// Fix header and update content-length field
374374
hb = append(hb, '\r', '\n')
375375
l, _ := wf.GetInt64(ContentLength)
@@ -396,7 +396,7 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig
396396
payloadDigest: payloadDigest,
397397
}
398398

399-
if herr == missingEndOfHeaders && !opts.fixSyntaxErrors {
399+
if herr == errMissingEndOfHeaders && !opts.fixSyntaxErrors {
400400
// We have to fix the header for parsing even if we don't fix the record
401401
hb = append(hb, '\r', '\n')
402402
}
@@ -418,7 +418,7 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig
418418
payloadDigest: payloadDigest,
419419
}
420420

421-
if herr == missingEndOfHeaders && !opts.fixSyntaxErrors {
421+
if herr == errMissingEndOfHeaders && !opts.fixSyntaxErrors {
422422
// We have to fix the header for parsing even if we don't fix the record
423423
hb = append(hb, '\r', '\n')
424424
}

internal/timestamp/timestamp.go

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -34,14 +34,6 @@ func From14ToTime(s string) (time.Time, error) {
3434
return t, err
3535
}
3636

37-
func UTC(t time.Time) time.Time {
38-
return t.In(time.UTC)
39-
}
40-
4137
func UTC14(t time.Time) string {
42-
return t.In(time.UTC).Format("20060102150405")
43-
}
44-
45-
func UTCW3cIso8601(t time.Time) string {
46-
return t.In(time.UTC).Format(time.RFC3339)
38+
return t.UTC().Format("20060102150405")
4739
}

internal/timestamp/timestamp_test.go

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -75,26 +75,10 @@ func TestFrom14ToTimeSucceedsOnValidString(t *testing.T) {
7575
}
7676
}
7777

78-
func TestUTC(t *testing.T) {
79-
data := createTestData()
80-
81-
if ts := timestamp.UTC(data.time); ts != data.time {
82-
t.Errorf("UTC() = %s, want %s", ts, data.time)
83-
}
84-
}
85-
8678
func TestUTC14(t *testing.T) {
8779
data := createTestData()
8880

8981
if ts := timestamp.UTC14(data.time); ts != data.gowarc14Date {
9082
t.Errorf("UTC14() = %s, want %s", ts, data.gowarc14Date)
9183
}
9284
}
93-
94-
func TestUTCW3cIso8601(t *testing.T) {
95-
data := createTestData()
96-
97-
if ts := timestamp.UTCW3cIso8601(data.time); ts != data.iso8601Date {
98-
t.Errorf("UTCW3cIso8601() = %s, want %s", ts, data.iso8601Date)
99-
}
100-
}

recordbuilder.go

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,11 @@ func (rb *recordBuilder) AddWarcHeaderInt64(name string, value int64) {
8080

8181
// AddWarcHeaderTime adds a new WARC header field with the given name and a time.Time value to the record
8282
func (rb *recordBuilder) AddWarcHeaderTime(name string, value time.Time) {
83-
rb.headers.AddTime(name, value)
83+
if rb.version.id == V1_0.id {
84+
rb.headers.AddTime(name, value)
85+
} else {
86+
rb.headers.AddTimeNano(name, value)
87+
}
8488
}
8589

8690
// Close releases resources used by the WarcRecordBuilder

recordbuilder_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ func TestRecordBuilder(t *testing.T) {
195195
"Referer: http://example.com/foo.html\n" +
196196
"Connection: close\n" +
197197
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n",
198-
&Validation{missingEndOfHeaders},
198+
&Validation{errMissingEndOfHeaders},
199199
true,
200200
},
201201
false,

unmarshaler_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -836,7 +836,7 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
836836
"Referer: http://www.archive.org/\r\n" +
837837
"Host: www.archive.org\r\n" +
838838
"Cookie: PHPSESSID=009d7bb11022f80605aa87e18224d824\r\n",
839-
&Validation{missingEndOfHeaders},
839+
&Validation{errMissingEndOfHeaders},
840840
true,
841841
},
842842
0,

warcfields.go

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ import (
2020
"fmt"
2121
"io"
2222
"net/http"
23+
"slices"
2324
"sort"
2425
"strconv"
2526
"strings"
@@ -141,6 +142,14 @@ func (wf *WarcFields) AddTime(name string, value time.Time) {
141142
wf.Add(name, value.UTC().Format(time.RFC3339))
142143
}
143144

145+
// AddTimeNano adds the key, value pair to the header.
146+
// It appends to any existing values associated with key. The key is case-insensitive.
147+
//
148+
// The value is formatted as RFC 3339 with up to nanosecond precision.
149+
func (wf *WarcFields) AddTimeNano(name string, value time.Time) {
150+
wf.Add(name, value.UTC().Format(time.RFC3339Nano))
151+
}
152+
144153
// AddId adds the key, value pair to the header.
145154
// It appends to any existing values associated with key. The key is case-insensitive.
146155
//
@@ -163,7 +172,7 @@ func (wf *WarcFields) Set(name string, value string) {
163172
for idx, nv := range *wf {
164173
if nv.Name == name {
165174
if isSet {
166-
*wf = append((*wf)[:idx], (*wf)[idx+1:]...)
175+
*wf = slices.Delete(*wf, idx, idx+1)
167176
} else {
168177
nv.Value = value
169178
isSet = true

warcfieldsparser.go

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,8 @@ import (
2525
)
2626

2727
var (
28-
colon = []byte{':'}
29-
endOfHeaders = errors.New("EOH")
28+
colon = []byte{':'}
29+
errEndOfHeaders = errors.New("EOH")
3030
)
3131

3232
type warcfieldsParser struct {
@@ -66,7 +66,7 @@ func (p *warcfieldsParser) readLine(r *bufio.Reader, pos *position) (line []byte
6666
line, err = r.ReadBytes('\n')
6767
if err != nil {
6868
if err == io.EOF {
69-
err = endOfHeaders
69+
err = errEndOfHeaders
7070
}
7171
line = bytes.Trim(line, sphtcrlf)
7272
return
@@ -97,7 +97,7 @@ func (p *warcfieldsParser) Parse(r *bufio.Reader, validation *Validation, pos *p
9797
for {
9898
line, nc, err := p.readLine(r, pos.incrLineNumber())
9999
if err != nil {
100-
if err == endOfHeaders {
100+
if err == errEndOfHeaders {
101101
eoh = true
102102
if len(line) == 0 {
103103
return &wf, nil

0 commit comments

Comments
 (0)