Skip to content

Commit 1a2f57b

Browse files
taku910 authored and hiroyuki-komatsu committed
Introduced InnerSegments, InnerSegmentBoundary class
This utility class is introduced to provide more consistent access to the inner segment information while hiding the internal encoding schema, which allows a more flexible implementation.

- Provides a modern iterator usable in range-based for loops.
- Provides a builder class for the inner boundary to encapsulate the encoding algorithm and error-prone data generation.
- Automatically falls back when invalid or empty boundary information is passed.
- Provides more consistent behavior for content_(key|value) and inner segment boundaries.

PiperOrigin-RevId: 790999040
1 parent 45b9cb2 commit 1a2f57b

35 files changed

+900
-494
lines changed

src/converter/BUILD.bazel

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,14 @@ mozc_cc_library(
5858
],
5959
deps = [
6060
":attribute",
61+
":inner_segment",
6162
":lattice",
6263
"//base:number_util",
6364
"//base:util",
6465
"//base:vlog",
6566
"//base/container:freelist",
6667
"//base/strings:assign",
68+
"@com_google_absl//absl/base:core_headers",
6769
"@com_google_absl//absl/log",
6870
"@com_google_absl//absl/log:check",
6971
"@com_google_absl//absl/strings",
@@ -91,6 +93,7 @@ mozc_cc_test(
9193
size = "small",
9294
srcs = ["candidate_test.cc"],
9395
deps = [
96+
":inner_segment",
9497
":segments",
9598
"//base:number_util",
9699
"//testing:gunit_main",
@@ -108,6 +111,41 @@ mozc_cc_library(
108111
],
109112
)
110113

114+
mozc_cc_library(
115+
name = "inner_segment",
116+
hdrs = [
117+
"inner_segment.h",
118+
],
119+
visibility = [
120+
"//engine:__pkg__",
121+
"//prediction:__pkg__",
122+
"//request:__pkg__",
123+
"//rewriter:__pkg__",
124+
],
125+
deps = [
126+
"@com_google_absl//absl/container:fixed_array",
127+
"@com_google_absl//absl/log:check",
128+
"@com_google_absl//absl/strings",
129+
"@com_google_absl//absl/strings:str_format",
130+
"@com_google_absl//absl/types:span",
131+
],
132+
)
133+
134+
mozc_cc_test(
135+
name = "inner_segment_test",
136+
srcs = [
137+
"inner_segment_test.cc",
138+
],
139+
deps = [
140+
":inner_segment",
141+
"//testing:gunit_main",
142+
"@com_google_absl//absl/log",
143+
"@com_google_absl//absl/log:check",
144+
"@com_google_absl//absl/strings",
145+
"@com_google_absl//absl/types:span",
146+
],
147+
)
148+
111149
mozc_cc_library(
112150
name = "segments_matchers",
113151
testonly = 1,
@@ -223,6 +261,7 @@ mozc_cc_library(
223261
deps = [
224262
":candidate_filter",
225263
":connector",
264+
":inner_segment",
226265
":lattice",
227266
":node",
228267
":segmenter",
@@ -423,6 +462,7 @@ mozc_cc_test(
423462
],
424463
deps = [
425464
":immutable_converter_no_factory",
465+
":inner_segment",
426466
":lattice",
427467
":node",
428468
":segments",
@@ -475,6 +515,7 @@ mozc_cc_library(
475515
":converter_interface",
476516
":history_reconstructor",
477517
":immutable_converter_interface",
518+
":inner_segment",
478519
":reverse_converter",
479520
":segments",
480521
"//base:util",
@@ -509,6 +550,7 @@ mozc_cc_test(
509550
":converter_interface",
510551
":immutable_converter_interface",
511552
":immutable_converter_no_factory",
553+
":inner_segment",
512554
":segments",
513555
":segments_matchers",
514556
"//base:util",

src/converter/candidate.cc

Lines changed: 5 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include "absl/strings/string_view.h"
4545
#include "absl/types/span.h"
4646
#include "base/number_util.h"
47+
#include "converter/inner_segment.h"
4748

4849
#ifdef MOZC_CANDIDATE_DEBUG
4950
#include "absl/strings/str_cat.h"
@@ -85,54 +86,6 @@ void Candidate::Dlog(absl::string_view filename, int line,
8586
}
8687
#endif // MOZC_CANDIDATE_DEBUG
8788

88-
bool Candidate::IsValid() const {
89-
if (inner_segment_boundary.empty()) {
90-
return true;
91-
}
92-
// The sums of the lengths of key, value components must coincide with those
93-
// of key, value, respectively.
94-
size_t sum_key_len = 0, sum_value_len = 0;
95-
for (InnerSegmentIterator iter(this); !iter.Done(); iter.Next()) {
96-
sum_key_len += iter.GetKey().size();
97-
sum_value_len += iter.GetValue().size();
98-
}
99-
return sum_key_len == key.size() && sum_value_len == value.size();
100-
}
101-
102-
bool Candidate::EncodeLengths(size_t key_len, size_t value_len,
103-
size_t content_key_len, size_t content_value_len,
104-
uint32_t *result) {
105-
if (key_len > std::numeric_limits<uint8_t>::max() ||
106-
value_len > std::numeric_limits<uint8_t>::max() ||
107-
content_key_len > std::numeric_limits<uint8_t>::max() ||
108-
content_value_len > std::numeric_limits<uint8_t>::max()) {
109-
return false;
110-
}
111-
*result = (static_cast<uint32_t>(key_len) << 24) |
112-
(static_cast<uint32_t>(value_len) << 16) |
113-
(static_cast<uint32_t>(content_key_len) << 8) |
114-
static_cast<uint32_t>(content_value_len);
115-
return true;
116-
}
117-
118-
std::tuple<size_t, size_t, size_t, size_t> Candidate::DecodeLengths(
119-
uint32_t encoded) {
120-
return std::make_tuple(encoded >> 24, (encoded >> 16) & 0xff,
121-
(encoded >> 8) & 0xff, (encoded & 0xff));
122-
}
123-
124-
bool Candidate::PushBackInnerSegmentBoundary(size_t key_len, size_t value_len,
125-
size_t content_key_len,
126-
size_t content_value_len) {
127-
uint32_t encoded;
128-
if (EncodeLengths(key_len, value_len, content_key_len, content_value_len,
129-
&encoded)) {
130-
inner_segment_boundary.push_back(encoded);
131-
return true;
132-
}
133-
return false;
134-
}
135-
13689
std::string Candidate::DebugString() const {
13790
std::stringstream os;
13891
os << "(key=" << key << " ckey=" << content_key << " val=" << value
@@ -151,70 +104,15 @@ std::string Candidate::DebugString() const {
151104
}
152105
if (!inner_segment_boundary.empty()) {
153106
os << " segbdd=";
154-
for (size_t i = 0; i < inner_segment_boundary.size(); ++i) {
155-
const uint32_t encoded_lengths = inner_segment_boundary[i];
156-
const auto [key_len, value_len, content_key_len, content_value_len] =
157-
DecodeLengths(encoded_lengths);
158-
os << absl::StreamFormat("<%d,%d,%d,%d>", key_len, value_len,
159-
content_key_len, content_value_len);
107+
for (const auto &iter : inner_segments()) {
108+
os << absl::StreamFormat(
109+
"<%d,%d,%d,%d>", iter.GetKey().size(), iter.GetValue().size(),
110+
iter.GetContentKey().size(), iter.GetContentValue().size());
160111
}
161112
}
162113
os << ")" << std::endl;
163114
return os.str();
164115
}
165116

166-
void Candidate::InnerSegmentIterator::Next() {
167-
DCHECK_LT(index_, inner_segment_boundary_.size());
168-
const uint32_t encoded_lengths = inner_segment_boundary_[index_++];
169-
key_offset_ += encoded_lengths >> 24;
170-
value_offset_ += (encoded_lengths >> 16) & 0xff;
171-
}
172-
173-
absl::string_view Candidate::InnerSegmentIterator::GetKey() const {
174-
DCHECK_LT(index_, inner_segment_boundary_.size());
175-
const uint32_t encoded_lengths = inner_segment_boundary_[index_];
176-
return absl::string_view(key_offset_, encoded_lengths >> 24);
177-
}
178-
179-
absl::string_view Candidate::InnerSegmentIterator::GetValue() const {
180-
DCHECK_LT(index_, inner_segment_boundary_.size());
181-
const uint32_t encoded_lengths = inner_segment_boundary_[index_];
182-
return absl::string_view(value_offset_, (encoded_lengths >> 16) & 0xff);
183-
}
184-
185-
absl::string_view Candidate::InnerSegmentIterator::GetContentKey() const {
186-
DCHECK_LT(index_, inner_segment_boundary_.size());
187-
const uint32_t encoded_lengths = inner_segment_boundary_[index_];
188-
return absl::string_view(key_offset_, (encoded_lengths >> 8) & 0xff);
189-
}
190-
191-
absl::string_view Candidate::InnerSegmentIterator::GetContentValue() const {
192-
DCHECK_LT(index_, inner_segment_boundary_.size());
193-
const uint32_t encoded_lengths = inner_segment_boundary_[index_];
194-
return absl::string_view(value_offset_, encoded_lengths & 0xff);
195-
}
196-
197-
absl::string_view Candidate::InnerSegmentIterator::GetFunctionalKey() const {
198-
DCHECK_LT(index_, inner_segment_boundary_.size());
199-
const uint32_t encoded_lengths = inner_segment_boundary_[index_];
200-
const int key_len = encoded_lengths >> 24;
201-
const int content_key_len = (encoded_lengths >> 8) & 0xff;
202-
if (const int key_size = key_len - content_key_len; key_size > 0) {
203-
return absl::string_view(key_offset_ + content_key_len, key_size);
204-
}
205-
return absl::string_view();
206-
}
207-
208-
absl::string_view Candidate::InnerSegmentIterator::GetFunctionalValue() const {
209-
DCHECK_LT(index_, inner_segment_boundary_.size());
210-
const uint32_t encoded_lengths = inner_segment_boundary_[index_];
211-
const int value_len = (encoded_lengths >> 16) & 0xff;
212-
const int content_value_len = encoded_lengths & 0xff;
213-
if (const int value_size = value_len - content_value_len; value_size > 0) {
214-
return absl::string_view(value_offset_ + content_value_len, value_size);
215-
}
216-
return absl::string_view();
217-
}
218-
219117
} // namespace converter
220118
} // namespace mozc

src/converter/candidate.h

Lines changed: 11 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,12 @@
3737
#include <tuple>
3838
#include <vector>
3939

40+
#include "absl/base/attributes.h"
4041
#include "absl/strings/string_view.h"
4142
#include "absl/types/span.h"
4243
#include "base/number_util.h"
4344
#include "converter/attribute.h"
45+
#include "converter/inner_segment.h"
4446

4547
#ifndef NDEBUG
4648
#define MOZC_CANDIDATE_DEBUG
@@ -137,7 +139,7 @@ class Candidate {
137139
// Boundary information for real time conversion. This will be set only for
138140
// real time conversion result candidates. Each element is the encoded
139141
// lengths of key, value, content key and content value.
140-
std::vector<uint32_t> inner_segment_boundary;
142+
InnerSegmentBoundary inner_segment_boundary;
141143
// LINT.ThenChange(//converter/segments_matchers.h)
142144

143145
// The original cost before rescoring. Used for debugging purpose.
@@ -148,68 +150,11 @@ class Candidate {
148150
mutable std::string log;
149151
#endif // MOZC_CANDIDATE_DEBUG
150152

151-
static bool EncodeLengths(size_t key_len, size_t value_len,
152-
size_t content_key_len, size_t content_value_len,
153-
uint32_t *result);
154-
155-
// This function ignores error, so be careful when using this.
156-
static uint32_t EncodeLengths(size_t key_len, size_t value_len,
157-
size_t content_key_len,
158-
size_t content_value_len) {
159-
uint32_t result;
160-
EncodeLengths(key_len, value_len, content_key_len, content_value_len,
161-
&result);
162-
return result;
153+
InnerSegments inner_segments() const {
154+
return InnerSegments(key, value, content_key, content_value,
155+
inner_segment_boundary);
163156
}
164157

165-
// returns [key_len, value_len, content_value_len, content_value_len]
166-
static std::tuple<size_t, size_t, size_t, size_t> DecodeLengths(
167-
uint32_t encoded);
168-
169-
// Inserts a new element to |inner_segment_boundary|. If one of four
170-
// lengths is longer than 255, this method returns false.
171-
bool PushBackInnerSegmentBoundary(size_t key_len, size_t value_len,
172-
size_t content_key_len,
173-
size_t content_value_len);
174-
175-
// Iterates inner segments. Usage example:
176-
// for (InnerSegmentIterator iter(&cand); !iter.Done(); iter.Next()) {
177-
// absl::string_view s = iter.GetContentKey();
178-
// ...
179-
// }
180-
class InnerSegmentIterator final {
181-
public:
182-
explicit InnerSegmentIterator(const Candidate *candidate)
183-
: inner_segment_boundary_(candidate->inner_segment_boundary),
184-
key_offset_(candidate->key.data()),
185-
value_offset_(candidate->value.data()),
186-
index_(0) {}
187-
188-
InnerSegmentIterator(absl::Span<const uint32_t> inner_segment_boundary,
189-
absl::string_view key, absl::string_view value)
190-
: inner_segment_boundary_(inner_segment_boundary),
191-
key_offset_(key.data()),
192-
value_offset_(value.data()),
193-
index_(0) {}
194-
195-
bool Done() const { return index_ == inner_segment_boundary_.size(); }
196-
197-
void Next();
198-
absl::string_view GetKey() const;
199-
absl::string_view GetValue() const;
200-
absl::string_view GetContentKey() const;
201-
absl::string_view GetContentValue() const;
202-
absl::string_view GetFunctionalKey() const;
203-
absl::string_view GetFunctionalValue() const;
204-
size_t GetIndex() const { return index_; }
205-
206-
private:
207-
const absl::Span<const uint32_t> inner_segment_boundary_;
208-
const char *key_offset_ = nullptr;
209-
const char *value_offset_ = nullptr;
210-
size_t index_ = 0;
211-
};
212-
213158
// Clears the Candidate with default values. Note that the default
214159
// constructor already does the same so you don't need to call Clear
215160
// explicitly.
@@ -225,18 +170,11 @@ class Candidate {
225170
// value.substr(content_value.size(), value.size() - content_value.size());
226171
absl::string_view functional_value() const;
227172

228-
// Returns whether the inner_segment_boundary member is consistent with
229-
// key and value.
230-
// Note: content_key and content_value are not checked here.
231-
// We cannot compose candidate's content_key and content_value directly
232-
// from the inner segments in the current implementation.
233-
// Example:
234-
// value: 車のほうがあとだ
235-
// content_value: 車のほうがあとだ
236-
// inner_segments:
237-
// <くるまのほうが, 車のほうが, くるま, 車>
238-
// <あとだ, あとだ, あとだ, あとだ>
239-
bool IsValid() const;
173+
// inner_segments() always returns valid information, so
174+
// IsValid() can return always true.
175+
// TODO(taku): Remove this method.
176+
ABSL_DEPRECATED("IsValid() always returns true.")
177+
bool IsValid() const { return true; }
240178
std::string DebugString() const;
241179

242180
friend std::ostream &operator<<(std::ostream &os,

0 commit comments

Comments
 (0)