Skip to content

Commit 6e94c68

Browse files
author
Mark Whitaker
authored
Support unicode letters (#20)
* Updated letter methods to support Unicode
* Added tests for Unicode letters
* Added static imports of Assert methods
1 parent 1dd6b26 commit 6e94c68

File tree

3 files changed

+1905
-1716
lines changed

3 files changed

+1905
-1716
lines changed

src/main/java/uk/co/mainwave/regextoolbox/RegexBuilder.java

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ public RegexBuilder nonDigit(final RegexQuantifier quantifier) {
357357
}
358358

359359
/**
360-
* Add an element to match any letter in the Roman alphabet (a-z, A-Z)
360+
* Add an element to match any Unicode letter
361361
*
362362
* @return The current {@link RegexBuilder} object, for method chaining
363363
*/
@@ -366,19 +366,19 @@ public RegexBuilder letter() {
366366
}
367367

368368
/**
369-
* Add an element to match any letter in the Roman alphabet (a-z, A-Z)
369+
* Add an element to match any Unicode letter
370370
*
371371
* @param quantifier Quantifier to apply to this element
372372
* @return The current {@link RegexBuilder} object, for method chaining
373373
*/
374374
public RegexBuilder letter(final RegexQuantifier quantifier) {
375-
stringBuilder.append("[a-zA-Z]");
375+
stringBuilder.append("\\p{L}");
376376
addQuantifier(quantifier);
377377
return this;
378378
}
379379

380380
/**
381-
* Add an element to match any character that is not a letter in the Roman alphabet (a-z, A-Z)
381+
* Add an element to match any character that is not a Unicode letter
382382
*
383383
* @return The current {@link RegexBuilder} object, for method chaining
384384
*/
@@ -387,19 +387,19 @@ public RegexBuilder nonLetter() {
387387
}
388388

389389
/**
390-
* Add an element to match any character that is not a letter in the Roman alphabet (a-z, A-Z)
390+
* Add an element to match any character that is not a Unicode letter
391391
*
392392
* @param quantifier Quantifier to apply to this element
393393
* @return The current {@link RegexBuilder} object, for method chaining
394394
*/
395395
public RegexBuilder nonLetter(final RegexQuantifier quantifier) {
396-
stringBuilder.append("[^a-zA-Z]");
396+
stringBuilder.append("\\P{L}");
397397
addQuantifier(quantifier);
398398
return this;
399399
}
400400

401401
/**
402-
* Add an element to match any upper-case letter in the Roman alphabet (A-Z).
402+
* Add an element to match any uppercase Unicode letter
403403
*
404404
* @return The current {@link RegexBuilder} object, for method chaining
405405
*/
@@ -408,19 +408,19 @@ public RegexBuilder uppercaseLetter() {
408408
}
409409

410410
/**
411-
* Add an element to match any upper-case letter in the Roman alphabet (A-Z).
411+
* Add an element to match any uppercase Unicode letter
412412
*
413413
* @param quantifier Quantifier to apply to this element
414414
* @return The current {@link RegexBuilder} object, for method chaining
415415
*/
416416
public RegexBuilder uppercaseLetter(final RegexQuantifier quantifier) {
417-
stringBuilder.append("[A-Z]");
417+
stringBuilder.append("\\p{Lu}");
418418
addQuantifier(quantifier);
419419
return this;
420420
}
421421

422422
/**
423-
* Add an element to match any lowercase letter in the Roman alphabet (a-z)
423+
* Add an element to match any lowercase Unicode letter
424424
*
425425
* @return The current {@link RegexBuilder} object, for method chaining
426426
*/
@@ -429,19 +429,19 @@ public RegexBuilder lowercaseLetter() {
429429
}
430430

431431
/**
432-
* Add an element to match any lowercase letter in the Roman alphabet (a-z)
432+
* Add an element to match any lowercase Unicode letter
433433
*
434434
* @param quantifier Quantifier to apply to this element
435435
* @return The current {@link RegexBuilder} object, for method chaining
436436
*/
437437
public RegexBuilder lowercaseLetter(final RegexQuantifier quantifier) {
438-
stringBuilder.append("[a-z]");
438+
stringBuilder.append("\\p{Ll}");
439439
addQuantifier(quantifier);
440440
return this;
441441
}
442442

443443
/**
444-
* Add an element to match any letter in the Roman alphabet or decimal digit (a-z, A-Z, 0-9)
444+
* Add an element to match any Unicode letter or decimal digit
445445
*
446446
* @return The current {@link RegexBuilder} object, for method chaining
447447
*/
@@ -450,19 +450,19 @@ public RegexBuilder letterOrDigit() {
450450
}
451451

452452
/**
453-
* Add an element to match any letter in the Roman alphabet or decimal digit (a-z, A-Z, 0-9)
453+
* Add an element to match any Unicode letter or ASCII decimal digit (0-9)
454454
*
455455
* @param quantifier Quantifier to apply to this element
456456
* @return The current {@link RegexBuilder} object, for method chaining
457457
*/
458458
public RegexBuilder letterOrDigit(final RegexQuantifier quantifier) {
459-
stringBuilder.append("[a-zA-Z0-9]");
459+
stringBuilder.append("[\\p{L}0-9]");
460460
addQuantifier(quantifier);
461461
return this;
462462
}
463463

464464
/**
465-
* Add an element to match any character that is not letter in the Roman alphabet or a decimal digit (a-z, A-Z, 0-9)
465+
* Add an element to match any character that is not a Unicode letter or a decimal digit
466466
*
467467
* @return The current {@link RegexBuilder} object, for method chaining
468468
*/
@@ -471,13 +471,13 @@ public RegexBuilder nonLetterOrDigit() {
471471
}
472472

473473
/**
474-
* Add an element to match any character that is not letter in the Roman alphabet or a decimal digit (a-z, A-Z, 0-9)
474+
* Add an element to match any character that is neither a Unicode letter nor an ASCII decimal digit (0-9)
475475
*
476476
* @param quantifier Quantifier to apply to this element
477477
* @return The current {@link RegexBuilder} object, for method chaining
478478
*/
479479
public RegexBuilder nonLetterOrDigit(final RegexQuantifier quantifier) {
480-
stringBuilder.append("[^a-zA-Z0-9]");
480+
stringBuilder.append("[^\\p{L}0-9]");
481481
addQuantifier(quantifier);
482482
return this;
483483
}
@@ -567,7 +567,7 @@ public RegexBuilder nonHexDigit(final RegexQuantifier quantifier) {
567567
}
568568

569569
/**
570-
* Add an element to match any Roman alphabet letter, decimal digit, or underscore (a-z, A-Z, 0-9, _)
570+
* Add an element to match any Unicode letter, decimal digit, or underscore
571571
*
572572
* @return The current {@link RegexBuilder} object, for method chaining
573573
*/
@@ -576,20 +576,19 @@ public RegexBuilder wordCharacter() {
576576
}
577577

578578
/**
579-
* Add an element to match any Roman alphabet letter, decimal digit, or underscore (a-z, A-Z, 0-9, _)
579+
* Add an element to match any Unicode letter, ASCII decimal digit (0-9), or underscore
580580
*
581581
* @param quantifier Quantifier to apply to this element
582582
* @return The current {@link RegexBuilder} object, for method chaining
583583
*/
584584
public RegexBuilder wordCharacter(final RegexQuantifier quantifier) {
585-
stringBuilder.append("\\w");
585+
stringBuilder.append("[\\p{L}0-9_]");
586586
addQuantifier(quantifier);
587587
return this;
588588
}
589589

590590
/**
591-
* Add an element to match any character that is not a Roman alphabet letter, decimal digit, or underscore
592-
* (a-z, A-Z, 0-9, _)
591+
* Add an element to match any character that is not a Unicode letter, decimal digit, or underscore
593592
*
594593
* @return The current {@link RegexBuilder} object, for method chaining
595594
*/
@@ -598,14 +597,13 @@ public RegexBuilder nonWordCharacter() {
598597
}
599598

600599
/**
601-
* Add an element to match any character that is not a Roman alphabet letter, decimal digit, or underscore
602-
* (a-z, A-Z, 0-9, _)
600+
* Add an element to match any character that is not a Unicode letter, ASCII decimal digit (0-9), or underscore
603601
*
604602
* @param quantifier Quantifier to apply to this element
605603
* @return The current {@link RegexBuilder} object, for method chaining
606604
*/
607605
public RegexBuilder nonWordCharacter(final RegexQuantifier quantifier) {
608-
stringBuilder.append("\\W");
606+
stringBuilder.append("[^\\p{L}0-9_]");
609607
addQuantifier(quantifier);
610608
return this;
611609
}

0 commit comments

Comments
 (0)