Skip to content

[libc] wcstok implementation #145989

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 28 additions & 24 deletions libc/src/wchar/wcstok.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,34 +13,38 @@

namespace LIBC_NAMESPACE_DECL {

// Reports whether `wc` occurs in `delimiters`, a wide string terminated by
// L'\0'. The terminator itself is never compared against `wc`, so the result
// is false when `wc` is L'\0' or when `delimiters` is empty.
bool isADelimeter(wchar_t wc, const wchar_t *delimiters) {
  const wchar_t *cursor = delimiters;
  while (*cursor != L'\0') {
    if (*cursor == wc)
      return true;
    ++cursor;
  }
  return false;
}

LLVM_LIBC_FUNCTION(wchar_t *, wcstok,
(wchar_t *__restrict str, const wchar_t *__restrict delim,
wchar_t **__restrict ptr)) {
if (str == nullptr)
str = *ptr;

bool foundTokenStart = false;
wchar_t *out = nullptr;
wchar_t *str_ptr;
for (str_ptr = str; *str_ptr != L'\0'; str_ptr++) {
bool inDelim = false;
for (const wchar_t *delim_ptr = delim; *delim_ptr != L'\0' && !inDelim;
delim_ptr++)
if (*str_ptr == *delim_ptr)
inDelim = true;

if (!inDelim && !foundTokenStart) {
foundTokenStart = true;
out = str_ptr;
} else if (inDelim && foundTokenStart) {
*str_ptr = L'\0';
*ptr = str_ptr + 1;
return out;
}
wchar_t **__restrict context)) {
if (str == nullptr) {
if (*context == nullptr)
return nullptr;

str = *context;
}

*ptr = str_ptr;
return out;
wchar_t *tok_start, *tok_end;
for (tok_start = str; *tok_start != L'\0' && isADelimeter(*tok_start, delim);
tok_start++)
;

for (tok_end = tok_start; *tok_end != L'\0' && !isADelimeter(*tok_end, delim);
tok_end++)
;

if (*tok_end != L'\0') {
*tok_end = L'\0';
tok_end++;
}
*context = tok_end;
return *tok_start == L'\0' ? nullptr : tok_start;
}

} // namespace LIBC_NAMESPACE_DECL
2 changes: 1 addition & 1 deletion libc/src/wchar/wcstok.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
namespace LIBC_NAMESPACE_DECL {

wchar_t *wcstok(wchar_t *__restrict str, const wchar_t *__restrict delim,
wchar_t **__restrict ptr);
wchar_t **__restrict context);

} // namespace LIBC_NAMESPACE_DECL

Expand Down
204 changes: 120 additions & 84 deletions libc/test/src/wchar/wcstok_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,135 +11,171 @@
#include "src/wchar/wcstok.h"
#include "test/UnitTest/Test.h"

TEST(LlvmLibcStrTokTest, NoTokenFound) {
wchar_t empty[] = L"";
wchar_t *buf;
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"", &buf), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"_", &buf), nullptr);

wchar_t single[] = L"_";
wchar_t *token = LIBC_NAMESPACE::wcstok(single, L"", &buf);
ASSERT_TRUE(token[0] == L'_');
ASSERT_TRUE(token[1] == L'\0');

wchar_t multiple[] = L"1,2";
token = LIBC_NAMESPACE::wcstok(multiple, L":", &buf);
ASSERT_TRUE(multiple[0] == L'1');
ASSERT_TRUE(multiple[1] == L',');
ASSERT_TRUE(multiple[2] == L'2');
ASSERT_TRUE(multiple[3] == L'\0');
TEST(LlvmLibcWCSTokReentrantTest, NoTokenFound) {
{ // Empty source and delimiter string.
wchar_t empty[] = L"";
wchar_t *reserve = nullptr;
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"", &reserve), nullptr);
// Another call to ensure that 'reserve' is not in a bad state.
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"", &reserve), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"", &reserve), nullptr);
}
{ // Empty source and single character delimiter string.
wchar_t empty[] = L"";
wchar_t *reserve = nullptr;
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"_", &reserve), nullptr);
// Another call to ensure that 'reserve' is not in a bad state.
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"_", &reserve), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"_", &reserve), nullptr);
}
{ // Same character source and delimiter string.
wchar_t single[] = L"_";
wchar_t *reserve = nullptr;
ASSERT_EQ(LIBC_NAMESPACE::wcstok(single, L"_", &reserve), nullptr);
// Another call to ensure that 'reserve' is not in a bad state.
ASSERT_EQ(LIBC_NAMESPACE::wcstok(single, L"_", &reserve), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"_", &reserve), nullptr);
}
{ // Multiple character source and single character delimiter string.
wchar_t multiple[] = L"1,2";
wchar_t *reserve = nullptr;
wchar_t *tok = LIBC_NAMESPACE::wcstok(multiple, L":", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L',');
ASSERT_TRUE(tok[2] == L'2');
ASSERT_TRUE(tok[3] == L'\0');
// Another call to ensure that 'reserve' is not in a bad state.
tok = LIBC_NAMESPACE::wcstok(multiple, L":", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L',');
ASSERT_TRUE(tok[2] == L'2');
ASSERT_TRUE(tok[3] == L'\0');
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L":", &reserve), nullptr);
}
}

TEST(LlvmLibcStrTokTest, DelimiterAsFirstCharacterShouldBeIgnored) {
wchar_t *buf;
TEST(LlvmLibcWCSTokReentrantTest, DelimiterAsFirstCharacterShouldBeIgnored) {
wchar_t src[] = L".123";
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L".", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'3');
ASSERT_TRUE(token[3] == L'\0');
wchar_t *reserve = nullptr;
wchar_t *tok = LIBC_NAMESPACE::wcstok(src, L".", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'3');
ASSERT_TRUE(tok[3] == L'\0');
// Another call to ensure that 'reserve' is not in a bad state.
tok = LIBC_NAMESPACE::wcstok(src, L".", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'3');
ASSERT_TRUE(tok[3] == L'\0');
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L".", &reserve), nullptr);
}

TEST(LlvmLibcStrTokTest, DelimiterIsMiddleCharacter) {
TEST(LlvmLibcWCSTokReentrantTest, DelimiterIsMiddleCharacter) {
wchar_t src[] = L"12,34";
wchar_t *buf;
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L",", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'\0');
wchar_t *reserve = nullptr;
wchar_t *tok = LIBC_NAMESPACE::wcstok(src, L",", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'\0');
// Another call to ensure that 'reserve' is not in a bad state.
tok = LIBC_NAMESPACE::wcstok(src, L",", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'\0');
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L",", &reserve), nullptr);
}

TEST(LlvmLibcStrTokTest, DelimiterAsLastCharacterShouldBeIgnored) {
TEST(LlvmLibcWCSTokReentrantTest, DelimiterAsLastCharacterShouldBeIgnored) {
wchar_t src[] = L"1234:";
wchar_t *buf;
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L":", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'3');
ASSERT_TRUE(token[3] == L'4');
ASSERT_TRUE(token[4] == L'\0');
wchar_t *reserve = nullptr;
wchar_t *tok = LIBC_NAMESPACE::wcstok(src, L":", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'3');
ASSERT_TRUE(tok[3] == L'4');
ASSERT_TRUE(tok[4] == L'\0');
// Another call to ensure that 'reserve' is not in a bad state.
tok = LIBC_NAMESPACE::wcstok(src, L":", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'3');
ASSERT_TRUE(tok[3] == L'4');
ASSERT_TRUE(tok[4] == L'\0');
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L":", &reserve), nullptr);
}

TEST(LlvmLibcStrTokTest, MultipleDelimiters) {
wchar_t src[] = L"12,.34";
wchar_t *buf;
wchar_t *token;

token = LIBC_NAMESPACE::wcstok(src, L".", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L',');
ASSERT_TRUE(token[3] == L'\0');

token = LIBC_NAMESPACE::wcstok(src, L".,", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(src, L",.", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(src, L":,.", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'\0');
TEST(LlvmLibcWCSTokReentrantTest, ShouldNotGoPastNullTerminator) {
wchar_t src[] = {L'1', L'2', L'\0', L',', L'3'};
wchar_t *reserve = nullptr;
wchar_t *tok = LIBC_NAMESPACE::wcstok(src, L",", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'\0');
// Another call to ensure that 'reserve' is not in a bad state.
tok = LIBC_NAMESPACE::wcstok(src, L",", &reserve);
ASSERT_TRUE(tok[0] == L'1');
ASSERT_TRUE(tok[1] == L'2');
ASSERT_TRUE(tok[2] == L'\0');
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L",", &reserve), nullptr);
}

TEST(LlvmLibcStrTokTest, ShouldNotGoPastNullTerminator) {
wchar_t src[] = {L'1', L'2', L'\0', L',', L'3'};
wchar_t *buf;
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L",", &buf);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'\0');
TEST(LlvmLibcWCSTokReentrantTest,
ShouldReturnNullptrWhenBothSrcAndSaveptrAreNull) {
wchar_t *src = nullptr;
wchar_t *reserve = nullptr;
// Ensure that instead of crashing if src and reserve are null, nullptr is
// returned
ASSERT_EQ(LIBC_NAMESPACE::wcstok(src, L",", &reserve), nullptr);
// And that neither src nor reserve are changed when that happens
ASSERT_EQ(src, nullptr);
ASSERT_EQ(reserve, nullptr);
}

TEST(LlvmLibcStrTokTest, SubsequentCallsShouldFindFollowingDelimiters) {
TEST(LlvmLibcWCSTokReentrantTest,
SubsequentCallsShouldFindFollowingDelimiters) {
wchar_t src[] = L"12,34.56";
wchar_t *buf;
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L",.", &buf);
wchar_t *reserve = nullptr;
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L",.", &reserve);
ASSERT_TRUE(token[0] == L'1');
ASSERT_TRUE(token[1] == L'2');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(nullptr, L",.", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L",.", &reserve);
ASSERT_TRUE(token[0] == L'3');
ASSERT_TRUE(token[1] == L'4');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(nullptr, L",.", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L",.", &reserve);
ASSERT_TRUE(token[0] == L'5');
ASSERT_TRUE(token[1] == L'6');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,_", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,_", &reserve);
ASSERT_EQ(token, nullptr);
// Subsequent calls after hitting the end of the string should also return
// nullptr.
token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,_", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,_", &reserve);
ASSERT_EQ(token, nullptr);
}

TEST(LlvmLibcStrTokTest, DelimitersShouldNotBeIncludedInToken) {
wchar_t *buf;
TEST(LlvmLibcWCSTokReentrantTest, DelimitersShouldNotBeIncludedInToken) {
wchar_t src[] = L"__ab__:_cd__:__ef__:__";
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L"_:", &buf);
wchar_t *reserve = nullptr;
wchar_t *token = LIBC_NAMESPACE::wcstok(src, L"_:", &reserve);
ASSERT_TRUE(token[0] == L'a');
ASSERT_TRUE(token[1] == L'b');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(nullptr, L":_", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L":_", &reserve);
ASSERT_TRUE(token[0] == L'c');
ASSERT_TRUE(token[1] == L'd');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,", &reserve);
ASSERT_TRUE(token[0] == L'e');
ASSERT_TRUE(token[1] == L'f');
ASSERT_TRUE(token[2] == L'\0');

token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,_", &buf);
token = LIBC_NAMESPACE::wcstok(nullptr, L"_:,_", &reserve);
ASSERT_EQ(token, nullptr);
}
Loading