Skip to content

Commit 14f3937

Browse files
authored
Add Support for DATE_TIMEs in STATA (#325)
1 parent 4cae7c9 commit 14f3937

File tree

4 files changed

+109
-4
lines changed

4 files changed

+109
-4
lines changed

src/bin/extract_metadata.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,15 +155,15 @@ static int handle_variable_sav(int index, readstat_variable_t *variable, const c
155155
} else if (hasPrefix(vformat, "YMDHMS16") == 0) {
156156
// e.g. 2013-01-31 1:02
157157
format = EXTRACT_METADATA_FORMAT_DATE_TIME;
158-
pattern = "yyyy-MM-dd h:mm";
158+
pattern = "yyyy-MM-dd hh:mm";
159159
} else if (hasPrefix(vformat, "YMDHMS19") == 0) {
160160
// e.g. 2013-01-31 1:02:33
161161
format = EXTRACT_METADATA_FORMAT_DATE_TIME;
162-
pattern = "yyyy-MM-dd h:mm:ss";
162+
pattern = "yyyy-MM-dd hh:mm:ss";
163163
} else if (hasPrefix(vformat, "YMDHMS19.2") == 0) {
164164
// e.g. 2013-01-31 1:02:33.72
165165
format = EXTRACT_METADATA_FORMAT_DATE_TIME;
166-
pattern = "yyyy-MM-dd h:mm:ss.SS+";
166+
pattern = "yyyy-MM-dd hh:mm:ss.SS+";
167167
} else if (hasPrefix(vformat, "MTIME5") == 0) {
168168
// e.g. 1754:36
169169
format = EXTRACT_METADATA_FORMAT_TIME;

src/bin/read_csv/csv_metadata.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ typedef struct csv_metadata {
1212
void *user_ctx;
1313
readstat_variable_t *variables;
1414
int* is_date;
15+
int* is_date_time;
1516
struct json_metadata *json_md;
1617
rs_read_module_t *output_module;
1718
} csv_metadata;

src/bin/read_csv/mod_dta.c

Lines changed: 99 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ void produce_column_header_dta(void *csv_metadata, const char *column, readstat_
206206
break;
207207
case EXTRACT_METADATA_FORMAT_TIME:
208208
case EXTRACT_METADATA_FORMAT_DATE_TIME:
209-
var->type = READSTAT_TYPE_INT32;
209+
var->type = READSTAT_TYPE_DOUBLE;
210210
snprintf(var->format, sizeof(var->format), "%s", "%tC");
211211
// %tC => is equivalent to coordinated universal time (UTC)
212212
break;
@@ -385,17 +385,115 @@ static readstat_value_t value_double_dta(const char *s, size_t len, struct csv_m
385385
return value;
386386
}
387387

388+
static readstat_value_t value_double_date_time_dta(const char *s, size_t len, struct csv_metadata *c) {
389+
// Handle empty or NULL strings as missing values
390+
if (s == NULL || len == 0 || *s == '\0') {
391+
readstat_value_t value = {
392+
.type = READSTAT_TYPE_DOUBLE,
393+
.is_system_missing = 1,
394+
.v = { .double_value = NAN }
395+
};
396+
return value;
397+
}
398+
399+
// Truncate the date string to 23 characters to remove the timezone offset and
400+
// microseconds, if present. STATA does not support timezones or microseconds.
401+
char date_time[24];
402+
snprintf(date_time, sizeof(date_time), "%s", s);
403+
404+
// Parse date-time components
405+
int year, month, day, hour, minute, second, msecs = 0;
406+
int matched = sscanf(
407+
date_time,
408+
"%d-%d-%d %d:%d:%d.%d",
409+
&year, &month, &day, &hour, &minute, &second, &msecs
410+
);
411+
if (matched < 6 || matched > 8) {
412+
fprintf(stderr, "%s:%d not a valid date-time: %s (expected format: yyyy-mm-dd hh:MM:SS with optional milliseconds. Datetime string is truncated at 23 characters to ignore microseconds and timezone information.)\n", __FILE__, __LINE__, date_time);
413+
exit(EXIT_FAILURE);
414+
}
415+
416+
// Get days since the epoch for the date
417+
char days_since_epoch_string[11];
418+
snprintf(days_since_epoch_string, sizeof(days_since_epoch_string), "%04d-%02d-%02d", year, month, day);
419+
char* dest;
420+
int days_since_epoch = readstat_dta_num_days(days_since_epoch_string, &dest);
421+
422+
// Add the hours, minutes, and seconds to the days
423+
double msecs_since_epoch = 86400000.0 * days_since_epoch + hour * 3600000.0 + minute * 60000.0 + second * 1000.0 + msecs * 1.0;
424+
425+
// Adjust for leap seconds; 27 have occurred as of writing this code
426+
// https://en.m.wikipedia.org/wiki/Leap_second
427+
typedef struct {
428+
int year;
429+
int month;
430+
int day;
431+
} leap_second_date;
432+
433+
leap_second_date leap_seconds[] = {
434+
{1972, 6, 30}, {1972, 12, 31}, // +2 seconds in 1972
435+
{1973, 12, 31}, // +1 second in 1973
436+
{1974, 12, 31}, // +1 second in 1974
437+
{1975, 12, 31}, // +1 second in 1975
438+
{1976, 12, 31}, // +1 second in 1976
439+
{1977, 12, 31}, // +1 second in 1977
440+
{1978, 12, 31}, // +1 second in 1978
441+
{1979, 12, 31}, // +1 second in 1979
442+
{1981, 6, 30}, // +1 second in 1981
443+
{1982, 6, 30}, // +1 second in 1982
444+
{1983, 6, 30}, // +1 second in 1983
445+
{1985, 6, 30}, // +1 second in 1985
446+
{1987, 12, 31}, // +1 second in 1987
447+
{1989, 12, 31}, // +1 second in 1989
448+
{1990, 12, 31}, // +1 second in 1990
449+
{1992, 6, 30}, // +1 second in 1992
450+
{1993, 6, 30}, // +1 second in 1993
451+
{1994, 6, 30}, // +1 second in 1994
452+
{1995, 12, 31}, // +1 second in 1995
453+
{1997, 6, 30}, // +1 second in 1997
454+
{1998, 12, 31}, // +1 second in 1998
455+
{2005, 12, 31}, // +1 second in 2005
456+
{2008, 12, 31}, // +1 second in 2008
457+
{2012, 6, 30}, // +1 second in 2012
458+
{2015, 6, 30}, // +1 second in 2015
459+
{2016, 12, 31} // +1 second in 2016
460+
};
461+
462+
int leap_second_count = sizeof(leap_seconds) / sizeof(leap_seconds[0]);
463+
int leap_seconds_to_add = 0;
464+
465+
for (int i = 0; i < leap_second_count; i++) {
466+
// If the date is after this leap second, add one second
467+
if (
468+
(year > leap_seconds[i].year) ||
469+
(year == leap_seconds[i].year && month > leap_seconds[i].month) ||
470+
(year == leap_seconds[i].year && month == leap_seconds[i].month && day > leap_seconds[i].day)
471+
) { leap_seconds_to_add++; }
472+
}
473+
msecs_since_epoch += leap_seconds_to_add * 1000.0;
474+
475+
readstat_value_t value = {
476+
.type = READSTAT_TYPE_DOUBLE,
477+
.v = { .double_value = msecs_since_epoch }
478+
};
479+
480+
return value;
481+
}
482+
388483
void produce_csv_value_dta(void *csv_metadata, const char *s, size_t len) {
389484
struct csv_metadata *c = (struct csv_metadata *)csv_metadata;
390485
readstat_variable_t *var = &c->variables[c->columns];
391486
int is_date = c->is_date[c->columns];
487+
int is_date_time = c->is_date_time[c->columns];
392488
int obs_index = c->rows - 1; // TODO: ???
393489
readstat_value_t value;
394490

395491
if (len == 0) {
396492
value = value_sysmiss(s, len, c);
397493
} else if (is_date) {
398494
value = value_int32_date_dta(s, len, c);
495+
} else if (is_date_time) {
496+
value = value_double_date_time_dta(s, len, c);
399497
} else if (var->type == READSTAT_TYPE_DOUBLE) {
400498
value = value_double_dta(s, len, c);
401499
} else if (var->type == READSTAT_TYPE_STRING) {

src/bin/read_csv/read_csv.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ static void produce_column_header(struct csv_metadata *c, void *s, size_t len) {
4646

4747
extract_metadata_format_t colformat = column_format(c->json_md, column);
4848
c->is_date[c->columns] = colformat == EXTRACT_METADATA_FORMAT_DATE;
49+
c->is_date_time[c->columns] = colformat == EXTRACT_METADATA_FORMAT_DATE_TIME;
4950
if (c->output_module->header) {
5051
c->output_module->header(c, column, var);
5152
}
@@ -75,6 +76,7 @@ static void csv_metadata_cell(void *s, size_t len, void *data)
7576
if (c->rows == 0) {
7677
c->variables = realloc(c->variables, (c->columns+1) * sizeof(readstat_variable_t));
7778
c->is_date = realloc(c->is_date, (c->columns+1) * sizeof(int));
79+
c->is_date_time = realloc(c->is_date_time, (c->columns+1) * sizeof(int));
7880
produce_column_header(c, s, len);
7981
} else if (c->rows >= 1 && c->handle.value && c->output_module->csv_value) {
8082
c->output_module->csv_value(c, s, len);
@@ -184,6 +186,10 @@ readstat_error_t readstat_parse_csv(readstat_parser_t *parser,
184186
free(md->is_date);
185187
md->is_date = NULL;
186188
}
189+
if (md->is_date_time) {
190+
free(md->is_date_time);
191+
md->is_date_time = NULL;
192+
}
187193
csv_free(p);
188194
io->close(io->io_ctx);
189195
return retval;

0 commit comments

Comments
 (0)