Skip to content
This repository was archived by the owner on Aug 29, 2025. It is now read-only.

Commit d327a58

Browse files
committed
Fixed bug in compute_multibyte_table() with codepoints spanning exactly 2 bytes.
Optimized get_num_resulting_bytes(). Removed unnecessary cast to uint8_t in get_indices().
1 parent 499e290 commit d327a58

File tree

2 files changed

+27
-24
lines changed

2 files changed

+27
-24
lines changed

include/tinyutf8.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -398,16 +398,19 @@ class utf8_string
398398
//! Get the nth index within a multibyte index table
399399
static inline size_type get_nth_index( const void* table , unsigned char table_element_size , size_type idx ){
400400
switch( table_element_size ){
401-
case sizeof(std::uint8_t): return static_cast<const std::uint8_t*>(table)[idx];
401+
case sizeof(std::uint8_t): return static_cast<const std::uint8_t*> (table)[idx];
402402
case sizeof(std::uint16_t): return static_cast<const std::uint16_t*>(table)[idx];
403+
case sizeof(size_type):
404+
default: return static_cast<const size_type*> (table)[idx];
403405
}
404406
return static_cast<const size_type*>(table)[idx];
405407
}
406408
static inline void set_nth_index( void* table , unsigned char table_element_size , size_type idx , size_type value ){
407409
switch( table_element_size ){
408-
case sizeof(std::uint8_t): static_cast<std::uint8_t*>(table)[idx] = value; break;
410+
case sizeof(std::uint8_t): static_cast<std::uint8_t*> (table)[idx] = value; break;
409411
case sizeof(std::uint16_t): static_cast<std::uint16_t*>(table)[idx] = value; break;
410-
case sizeof(size_type): static_cast<size_type*>(table)[idx] = value; break;
412+
case sizeof(size_type):
413+
default: static_cast<size_type*> (table)[idx] = value; break;
411414
}
412415
}
413416

@@ -436,11 +439,9 @@ class utf8_string
436439
const void* get_indices() const {
437440
if( this->sso_active() || !this->_indices_len )
438441
return nullptr;
439-
return reinterpret_cast<std::uint8_t*>(
440-
this->_buffer
441-
+ this->_capacity // Go to end of buffer
442+
return this->_buffer + this->_capacity // Go to end of buffer
442443
- ( this->_indices_len * get_index_datatype_bytes( this->_buffer_len ) ) // Subtract the number of bytes that the indices table occupies
443-
);
444+
;
444445
}
445446
void* get_indices(){
446447
return const_cast<void*>( static_cast<const utf8_string*>(this)->get_indices() );

lib/tinyutf8.cpp

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -556,26 +556,24 @@ utf8_string::size_type utf8_string::get_num_resulting_bytes( size_type start_byt
556556
bool misformatted;
557557
bool* check_misformatted = this->_misformatted ? &misformatted : nullptr;
558558
size_type cur_byte = start_byte;
559+
size_type buffer_len = size + 1;
559560

560561
// Reduce the byte count by the number of utf8 data bytes
561-
if( this->sso_active() ){
562+
if( utf8_string::is_small_string(buffer_len) ){ // this->sso_active(), but we have already cached size()
562563
while( codepoint_count-- > 0 && cur_byte < size )
563-
cur_byte += get_num_bytes_of_utf8_char( buffer , cur_byte , this->_buffer_len , check_misformatted );
564-
return cur_byte - start_byte;
564+
cur_byte += get_num_bytes_of_utf8_char( buffer , cur_byte , buffer_len , check_misformatted );
565565
}
566-
567-
// Add at least as many bytes as codepoints
568-
cur_byte += codepoint_count;
569-
570-
if( size_type indices_len = this->_indices_len )
566+
else if( size_type indices_len = this->_indices_len )
571567
{
568+
// Add at least as many bytes as codepoints
569+
cur_byte += codepoint_count;
570+
572571
size_type index_multibyte_table = 0;
573572
const void* indices = this->get_indices();
574-
unsigned char indices_datatype_bytes = get_index_datatype_bytes( this->_buffer_len );
573+
unsigned char indices_datatype_bytes = get_index_datatype_bytes(buffer_len);
575574

576575
// Iterate to the start of the relevant part of the multibyte table
577-
while( index_multibyte_table < indices_len )
578-
{
576+
while( index_multibyte_table < indices_len ){
579577
if( utf8_string::get_nth_index( indices , indices_datatype_bytes , index_multibyte_table ) >= start_byte )
580578
break;
581579
index_multibyte_table++;
@@ -592,9 +590,11 @@ utf8_string::size_type utf8_string::get_num_resulting_bytes( size_type start_byt
592590
index_multibyte_table++;
593591

594592
// Add the utf8 data bytes to the number of bytes
595-
cur_byte += get_num_bytes_of_utf8_char( buffer , multibyte_pos , this->_buffer_len , check_misformatted ) - 1; // Add utf8 data bytes
593+
cur_byte += get_num_bytes_of_utf8_char( buffer , multibyte_pos , buffer_len , check_misformatted ) - 1; // Add utf8 data bytes
596594
}
597595
}
596+
else
597+
return codepoint_count;
598598

599599
return cur_byte - start_byte;
600600
}
@@ -693,9 +693,9 @@ void utf8_string::compute_multibyte_table( void* table , bool* misformatted )
693693
unsigned char indices_datatype_bytes = get_index_datatype_bytes( buffer_len );
694694

695695
// Fill Multibyte Table
696-
for( size_type index = 0 ; index < buffer_len ; index++ )
696+
for( size_type index = 0 ; index < buffer_len ; )
697697
{
698-
unsigned char cur_num_bytes = get_num_bytes_of_utf8_char( buffer , index , buffer_len , misformatted ) - 1;
698+
unsigned char cur_num_bytes = get_num_bytes_of_utf8_char( buffer , index , buffer_len , misformatted );
699699
if( cur_num_bytes > 1 )
700700
utf8_string::set_nth_index( table , indices_datatype_bytes , multibyte_index++ , index );
701701
index += cur_num_bytes;
@@ -1137,12 +1137,14 @@ utf8_string::value_type utf8_string::at( size_type requested_index ) const
11371137
if( requested_index >= size() )
11381138
return 0;
11391139

1140-
if( !this->sso_active() && !requires_unicode() )
1141-
return (value_type) this->_buffer[requested_index];
1140+
const char* buffer = this->get_buffer();
1141+
1142+
if( !requires_unicode() )
1143+
return (value_type) buffer[requested_index];
11421144

11431145
// Decode internal buffer at position n
11441146
value_type codepoint = 0;
1145-
decode_utf8( this->get_buffer() + get_num_resulting_bytes( 0 , requested_index ) , codepoint , this->_misformatted );
1147+
decode_utf8( buffer + get_num_resulting_bytes( 0 , requested_index ) , codepoint , this->_misformatted );
11461148

11471149
return codepoint;
11481150
}

0 commit comments

Comments
 (0)