Browse Source
- Add unicode_normalize.go with mappings for small caps and fraktur
- Map 77 decorative unicode characters to ASCII equivalents:
  - Small caps (25 chars): ᴅᴇᴀᴛʜ → death
  - Fraktur lowercase (26 chars): 𝔡𝔢𝔞𝔱𝔥 → death
  - Fraktur uppercase (26 chars): 𝔇𝔈𝔄𝔗ℌ → death
- Fix broken utf8DecodeRuneInString() that failed on multi-byte UTF-8
- Add migration v7 to rebuild word indexes with normalization
- Add comprehensive unit tests for all character mappings

Files modified:
- pkg/database/unicode_normalize.go: New - character mapping tables
- pkg/database/unicode_normalize_test.go: New - unit tests
- pkg/database/tokenize.go: Integrate normalizeRune(), fix UTF-8 decoder
- pkg/database/migrations.go: Add version 7 migration

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

main
5 changed files with 439 additions and 13 deletions
@ -0,0 +1,135 @@ |
|||||||
|
//go:build !(js && wasm)
|
||||||
|
|
||||||
|
package database |
||||||
|
|
||||||
|
// normalizeRune maps decorative unicode characters (small caps, fraktur) back to
|
||||||
|
// their ASCII equivalents for consistent word indexing. This ensures that text
|
||||||
|
// written with decorative alphabets (e.g., "ᴅᴇᴀᴛʜ" or "𝔇𝔢𝔞𝔱𝔥") indexes the same
|
||||||
|
// as regular ASCII ("death").
|
||||||
|
//
|
||||||
|
// Character sets normalized:
|
||||||
|
// - Small Caps (used for DEATH-style text in Terry Pratchett tradition)
|
||||||
|
// - Mathematical Fraktur lowercase (𝔞-𝔷)
|
||||||
|
// - Mathematical Fraktur uppercase (𝔄-ℨ, including Letterlike Symbols block exceptions)
|
||||||
|
func normalizeRune(r rune) rune { |
||||||
|
// Check small caps first (scattered codepoints)
|
||||||
|
if mapped, ok := smallCapsToASCII[r]; ok { |
||||||
|
return mapped |
||||||
|
} |
||||||
|
|
||||||
|
// Check fraktur lowercase: U+1D51E to U+1D537 (contiguous range)
|
||||||
|
if r >= 0x1D51E && r <= 0x1D537 { |
||||||
|
return 'a' + (r - 0x1D51E) |
||||||
|
} |
||||||
|
|
||||||
|
// Check fraktur uppercase main range: U+1D504 to U+1D51C (with gaps)
|
||||||
|
if r >= 0x1D504 && r <= 0x1D51C { |
||||||
|
if mapped, ok := frakturUpperToASCII[r]; ok { |
||||||
|
return mapped |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Check fraktur uppercase exceptions from Letterlike Symbols block
|
||||||
|
if mapped, ok := frakturLetterlikeToASCII[r]; ok { |
||||||
|
return mapped |
||||||
|
} |
||||||
|
|
||||||
|
return r |
||||||
|
} |
||||||
|
|
||||||
|
// smallCapsToASCII maps small capital letters to lowercase ASCII.
// The keys are scattered across several Unicode blocks (IPA Extensions,
// Phonetic Extensions, Latin Extended-D, plus one Latin Extended-B stand-in
// for Q), which is why a lookup table is used rather than range arithmetic.
//
// There is no entry for X: Unicode has no small capital X.
var smallCapsToASCII = map[rune]rune{
	'ᴀ': 'a', // U+1D00 LATIN LETTER SMALL CAPITAL A
	'ʙ': 'b', // U+0299 LATIN LETTER SMALL CAPITAL B
	'ᴄ': 'c', // U+1D04 LATIN LETTER SMALL CAPITAL C
	'ᴅ': 'd', // U+1D05 LATIN LETTER SMALL CAPITAL D
	'ᴇ': 'e', // U+1D07 LATIN LETTER SMALL CAPITAL E
	'ꜰ': 'f', // U+A730 LATIN LETTER SMALL CAPITAL F
	'ɢ': 'g', // U+0262 LATIN LETTER SMALL CAPITAL G
	'ʜ': 'h', // U+029C LATIN LETTER SMALL CAPITAL H
	'ɪ': 'i', // U+026A LATIN LETTER SMALL CAPITAL I
	'ᴊ': 'j', // U+1D0A LATIN LETTER SMALL CAPITAL J
	'ᴋ': 'k', // U+1D0B LATIN LETTER SMALL CAPITAL K
	'ʟ': 'l', // U+029F LATIN LETTER SMALL CAPITAL L
	'ᴍ': 'm', // U+1D0D LATIN LETTER SMALL CAPITAL M
	'ɴ': 'n', // U+0274 LATIN LETTER SMALL CAPITAL N
	'ᴏ': 'o', // U+1D0F LATIN LETTER SMALL CAPITAL O
	'ᴘ': 'p', // U+1D18 LATIN LETTER SMALL CAPITAL P

	// Q has two spellings in the wild. U+A7AF is the real small capital Q
	// (Latin Extended-D); it is written as an escape because many fonts lack
	// the glyph. U+01EB is the look-alike that most "small caps" text
	// generators emit instead.
	// NOTE(review): U+01EB is also a genuine letter in some orthographies, so
	// words containing it will be indexed with 'q' — confirm this is acceptable.
	'\uA7AF': 'q', // U+A7AF LATIN LETTER SMALL CAPITAL Q
	'ǫ':      'q', // U+01EB LATIN SMALL LETTER O WITH OGONEK (stand-in for small cap Q)

	'ʀ': 'r', // U+0280 LATIN LETTER SMALL CAPITAL R
	'ꜱ': 's', // U+A731 LATIN LETTER SMALL CAPITAL S
	'ᴛ': 't', // U+1D1B LATIN LETTER SMALL CAPITAL T
	'ᴜ': 'u', // U+1D1C LATIN LETTER SMALL CAPITAL U
	'ᴠ': 'v', // U+1D20 LATIN LETTER SMALL CAPITAL V
	'ᴡ': 'w', // U+1D21 LATIN LETTER SMALL CAPITAL W
	'ʏ': 'y', // U+028F LATIN LETTER SMALL CAPITAL Y
	'ᴢ': 'z', // U+1D22 LATIN LETTER SMALL CAPITAL Z
}
||||||
|
|
||||||
|
// frakturUpperToASCII folds the Mathematical Fraktur capital letters onto
// lowercase ASCII. Keys are written as code points; the glyph is shown in the
// trailing comment. The run U+1D504-U+1D51C is not contiguous: capital C, H, I,
// R and Z are encoded in the Letterlike Symbols block and therefore have no
// entry here.
var frakturUpperToASCII = map[rune]rune{
	0x1D504: 'a', // 𝔄 MATHEMATICAL FRAKTUR CAPITAL A
	0x1D505: 'b', // 𝔅 MATHEMATICAL FRAKTUR CAPITAL B
	// U+1D506 is a gap: C is U+212D (Letterlike Symbols).
	0x1D507: 'd', // 𝔇 MATHEMATICAL FRAKTUR CAPITAL D
	0x1D508: 'e', // 𝔈 MATHEMATICAL FRAKTUR CAPITAL E
	0x1D509: 'f', // 𝔉 MATHEMATICAL FRAKTUR CAPITAL F
	0x1D50A: 'g', // 𝔊 MATHEMATICAL FRAKTUR CAPITAL G
	// U+1D50B is a gap: H is U+210C (Letterlike Symbols).
	// U+1D50C is a gap: I is U+2111 (Letterlike Symbols).
	0x1D50D: 'j', // 𝔍 MATHEMATICAL FRAKTUR CAPITAL J
	0x1D50E: 'k', // 𝔎 MATHEMATICAL FRAKTUR CAPITAL K
	0x1D50F: 'l', // 𝔏 MATHEMATICAL FRAKTUR CAPITAL L
	0x1D510: 'm', // 𝔐 MATHEMATICAL FRAKTUR CAPITAL M
	0x1D511: 'n', // 𝔑 MATHEMATICAL FRAKTUR CAPITAL N
	0x1D512: 'o', // 𝔒 MATHEMATICAL FRAKTUR CAPITAL O
	0x1D513: 'p', // 𝔓 MATHEMATICAL FRAKTUR CAPITAL P
	0x1D514: 'q', // 𝔔 MATHEMATICAL FRAKTUR CAPITAL Q
	// U+1D515 is a gap: R is U+211C (Letterlike Symbols).
	0x1D516: 's', // 𝔖 MATHEMATICAL FRAKTUR CAPITAL S
	0x1D517: 't', // 𝔗 MATHEMATICAL FRAKTUR CAPITAL T
	0x1D518: 'u', // 𝔘 MATHEMATICAL FRAKTUR CAPITAL U
	0x1D519: 'v', // 𝔙 MATHEMATICAL FRAKTUR CAPITAL V
	0x1D51A: 'w', // 𝔚 MATHEMATICAL FRAKTUR CAPITAL W
	0x1D51B: 'x', // 𝔛 MATHEMATICAL FRAKTUR CAPITAL X
	0x1D51C: 'y', // 𝔜 MATHEMATICAL FRAKTUR CAPITAL Y
	// Z is U+2128 (Letterlike Symbols).
}
||||||
|
|
||||||
|
// frakturLetterlikeToASCII covers the five Fraktur capitals that Unicode
// encodes in the Letterlike Symbols block (U+2100-U+214F) instead of in
// Mathematical Alphanumeric Symbols. Keys are written as code points, in
// ascending order, with the glyph shown in the trailing comment.
var frakturLetterlikeToASCII = map[rune]rune{
	0x210C: 'h', // ℌ BLACK-LETTER CAPITAL H
	0x2111: 'i', // ℑ BLACK-LETTER CAPITAL I
	0x211C: 'r', // ℜ BLACK-LETTER CAPITAL R
	0x2128: 'z', // ℨ BLACK-LETTER CAPITAL Z
	0x212D: 'c', // ℭ BLACK-LETTER CAPITAL C
}
||||||
|
|
||||||
|
// hasDecorativeUnicode checks if text contains any small caps or fraktur characters
|
||||||
|
// that would need normalization. Used by migration to identify events needing re-indexing.
|
||||||
|
func hasDecorativeUnicode(s string) bool { |
||||||
|
for _, r := range s { |
||||||
|
// Check small caps
|
||||||
|
if _, ok := smallCapsToASCII[r]; ok { |
||||||
|
return true |
||||||
|
} |
||||||
|
// Check fraktur lowercase range
|
||||||
|
if r >= 0x1D51E && r <= 0x1D537 { |
||||||
|
return true |
||||||
|
} |
||||||
|
// Check fraktur uppercase range
|
||||||
|
if r >= 0x1D504 && r <= 0x1D51C { |
||||||
|
return true |
||||||
|
} |
||||||
|
// Check letterlike symbols fraktur
|
||||||
|
if _, ok := frakturLetterlikeToASCII[r]; ok { |
||||||
|
return true |
||||||
|
} |
||||||
|
} |
||||||
|
return false |
||||||
|
} |
||||||
@ -0,0 +1,205 @@ |
|||||||
|
//go:build !(js && wasm)
|
||||||
|
|
||||||
|
package database |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"testing" |
||||||
|
) |
||||||
|
|
||||||
|
func TestNormalizeRune(t *testing.T) { |
||||||
|
tests := []struct { |
||||||
|
name string |
||||||
|
input rune |
||||||
|
expected rune |
||||||
|
}{ |
||||||
|
// Small caps
|
||||||
|
{"small cap A", 'ᴀ', 'a'}, |
||||||
|
{"small cap B", 'ʙ', 'b'}, |
||||||
|
{"small cap C", 'ᴄ', 'c'}, |
||||||
|
{"small cap D", 'ᴅ', 'd'}, |
||||||
|
{"small cap E", 'ᴇ', 'e'}, |
||||||
|
{"small cap F", 'ꜰ', 'f'}, |
||||||
|
{"small cap G", 'ɢ', 'g'}, |
||||||
|
{"small cap H", 'ʜ', 'h'}, |
||||||
|
{"small cap I", 'ɪ', 'i'}, |
||||||
|
{"small cap J", 'ᴊ', 'j'}, |
||||||
|
{"small cap K", 'ᴋ', 'k'}, |
||||||
|
{"small cap L", 'ʟ', 'l'}, |
||||||
|
{"small cap M", 'ᴍ', 'm'}, |
||||||
|
{"small cap N", 'ɴ', 'n'}, |
||||||
|
{"small cap O", 'ᴏ', 'o'}, |
||||||
|
{"small cap P", 'ᴘ', 'p'}, |
||||||
|
{"small cap Q (ogonek)", 'ǫ', 'q'}, |
||||||
|
{"small cap R", 'ʀ', 'r'}, |
||||||
|
{"small cap S", 'ꜱ', 's'}, |
||||||
|
{"small cap T", 'ᴛ', 't'}, |
||||||
|
{"small cap U", 'ᴜ', 'u'}, |
||||||
|
{"small cap V", 'ᴠ', 'v'}, |
||||||
|
{"small cap W", 'ᴡ', 'w'}, |
||||||
|
{"small cap Y", 'ʏ', 'y'}, |
||||||
|
{"small cap Z", 'ᴢ', 'z'}, |
||||||
|
|
||||||
|
// Fraktur lowercase
|
||||||
|
{"fraktur lower a", '𝔞', 'a'}, |
||||||
|
{"fraktur lower b", '𝔟', 'b'}, |
||||||
|
{"fraktur lower c", '𝔠', 'c'}, |
||||||
|
{"fraktur lower d", '𝔡', 'd'}, |
||||||
|
{"fraktur lower e", '𝔢', 'e'}, |
||||||
|
{"fraktur lower f", '𝔣', 'f'}, |
||||||
|
{"fraktur lower g", '𝔤', 'g'}, |
||||||
|
{"fraktur lower h", '𝔥', 'h'}, |
||||||
|
{"fraktur lower i", '𝔦', 'i'}, |
||||||
|
{"fraktur lower j", '𝔧', 'j'}, |
||||||
|
{"fraktur lower k", '𝔨', 'k'}, |
||||||
|
{"fraktur lower l", '𝔩', 'l'}, |
||||||
|
{"fraktur lower m", '𝔪', 'm'}, |
||||||
|
{"fraktur lower n", '𝔫', 'n'}, |
||||||
|
{"fraktur lower o", '𝔬', 'o'}, |
||||||
|
{"fraktur lower p", '𝔭', 'p'}, |
||||||
|
{"fraktur lower q", '𝔮', 'q'}, |
||||||
|
{"fraktur lower r", '𝔯', 'r'}, |
||||||
|
{"fraktur lower s", '𝔰', 's'}, |
||||||
|
{"fraktur lower t", '𝔱', 't'}, |
||||||
|
{"fraktur lower u", '𝔲', 'u'}, |
||||||
|
{"fraktur lower v", '𝔳', 'v'}, |
||||||
|
{"fraktur lower w", '𝔴', 'w'}, |
||||||
|
{"fraktur lower x", '𝔵', 'x'}, |
||||||
|
{"fraktur lower y", '𝔶', 'y'}, |
||||||
|
{"fraktur lower z", '𝔷', 'z'}, |
||||||
|
|
||||||
|
// Fraktur uppercase (main range)
|
||||||
|
{"fraktur upper A", '𝔄', 'a'}, |
||||||
|
{"fraktur upper B", '𝔅', 'b'}, |
||||||
|
{"fraktur upper D", '𝔇', 'd'}, |
||||||
|
{"fraktur upper E", '𝔈', 'e'}, |
||||||
|
{"fraktur upper F", '𝔉', 'f'}, |
||||||
|
{"fraktur upper G", '𝔊', 'g'}, |
||||||
|
{"fraktur upper J", '𝔍', 'j'}, |
||||||
|
{"fraktur upper K", '𝔎', 'k'}, |
||||||
|
{"fraktur upper L", '𝔏', 'l'}, |
||||||
|
{"fraktur upper M", '𝔐', 'm'}, |
||||||
|
{"fraktur upper N", '𝔑', 'n'}, |
||||||
|
{"fraktur upper O", '𝔒', 'o'}, |
||||||
|
{"fraktur upper P", '𝔓', 'p'}, |
||||||
|
{"fraktur upper Q", '𝔔', 'q'}, |
||||||
|
{"fraktur upper S", '𝔖', 's'}, |
||||||
|
{"fraktur upper T", '𝔗', 't'}, |
||||||
|
{"fraktur upper U", '𝔘', 'u'}, |
||||||
|
{"fraktur upper V", '𝔙', 'v'}, |
||||||
|
{"fraktur upper W", '𝔚', 'w'}, |
||||||
|
{"fraktur upper X", '𝔛', 'x'}, |
||||||
|
{"fraktur upper Y", '𝔜', 'y'}, |
||||||
|
|
||||||
|
// Fraktur uppercase (Letterlike Symbols block)
|
||||||
|
{"fraktur upper C (letterlike)", 'ℭ', 'c'}, |
||||||
|
{"fraktur upper H (letterlike)", 'ℌ', 'h'}, |
||||||
|
{"fraktur upper I (letterlike)", 'ℑ', 'i'}, |
||||||
|
{"fraktur upper R (letterlike)", 'ℜ', 'r'}, |
||||||
|
{"fraktur upper Z (letterlike)", 'ℨ', 'z'}, |
||||||
|
|
||||||
|
// Regular ASCII should pass through unchanged
|
||||||
|
{"regular lowercase a", 'a', 'a'}, |
||||||
|
{"regular lowercase z", 'z', 'z'}, |
||||||
|
{"regular uppercase A", 'A', 'A'}, |
||||||
|
{"regular digit 5", '5', '5'}, |
||||||
|
|
||||||
|
// Other unicode should pass through unchanged
|
||||||
|
{"cyrillic д", 'д', 'д'}, |
||||||
|
{"greek α", 'α', 'α'}, |
||||||
|
{"emoji", '🎉', '🎉'}, |
||||||
|
} |
||||||
|
|
||||||
|
for _, tt := range tests { |
||||||
|
t.Run(tt.name, func(t *testing.T) { |
||||||
|
result := normalizeRune(tt.input) |
||||||
|
if result != tt.expected { |
||||||
|
t.Errorf("normalizeRune(%q) = %q, want %q", tt.input, result, tt.expected) |
||||||
|
} |
||||||
|
}) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func TestHasDecorativeUnicode(t *testing.T) { |
||||||
|
tests := []struct { |
||||||
|
name string |
||||||
|
input string |
||||||
|
expected bool |
||||||
|
}{ |
||||||
|
{"plain ASCII", "hello world", false}, |
||||||
|
{"small caps word", "ᴅᴇᴀᴛʜ", true}, |
||||||
|
{"fraktur lowercase", "𝔥𝔢𝔩𝔩𝔬", true}, |
||||||
|
{"fraktur uppercase", "𝔇𝔈𝔄𝔗ℌ", true}, |
||||||
|
{"mixed with ASCII", "hello ᴡᴏʀʟᴅ", true}, |
||||||
|
{"single small cap", "aᴀa", true}, |
||||||
|
{"cyrillic (no normalize)", "привет", false}, |
||||||
|
{"empty string", "", false}, |
||||||
|
{"letterlike fraktur C", "ℭool", true}, |
||||||
|
} |
||||||
|
|
||||||
|
for _, tt := range tests { |
||||||
|
t.Run(tt.name, func(t *testing.T) { |
||||||
|
result := hasDecorativeUnicode(tt.input) |
||||||
|
if result != tt.expected { |
||||||
|
t.Errorf("hasDecorativeUnicode(%q) = %v, want %v", tt.input, result, tt.expected) |
||||||
|
} |
||||||
|
}) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func TestTokenHashesNormalization(t *testing.T) { |
||||||
|
// All three representations should produce the same hash
|
||||||
|
ascii := TokenHashes([]byte("death")) |
||||||
|
smallCaps := TokenHashes([]byte("ᴅᴇᴀᴛʜ")) |
||||||
|
frakturLower := TokenHashes([]byte("𝔡𝔢𝔞𝔱𝔥")) |
||||||
|
frakturUpper := TokenHashes([]byte("𝔇𝔈𝔄𝔗ℌ")) |
||||||
|
|
||||||
|
if len(ascii) != 1 { |
||||||
|
t.Fatalf("expected 1 hash for 'death', got %d", len(ascii)) |
||||||
|
} |
||||||
|
if len(smallCaps) != 1 { |
||||||
|
t.Fatalf("expected 1 hash for small caps, got %d", len(smallCaps)) |
||||||
|
} |
||||||
|
if len(frakturLower) != 1 { |
||||||
|
t.Fatalf("expected 1 hash for fraktur lower, got %d", len(frakturLower)) |
||||||
|
} |
||||||
|
if len(frakturUpper) != 1 { |
||||||
|
t.Fatalf("expected 1 hash for fraktur upper, got %d", len(frakturUpper)) |
||||||
|
} |
||||||
|
|
||||||
|
// All should match the ASCII version
|
||||||
|
if !bytes.Equal(ascii[0], smallCaps[0]) { |
||||||
|
t.Errorf("small caps hash differs from ASCII\nASCII: %x\nsmall caps: %x", ascii[0], smallCaps[0]) |
||||||
|
} |
||||||
|
if !bytes.Equal(ascii[0], frakturLower[0]) { |
||||||
|
t.Errorf("fraktur lower hash differs from ASCII\nASCII: %x\nfraktur lower: %x", ascii[0], frakturLower[0]) |
||||||
|
} |
||||||
|
if !bytes.Equal(ascii[0], frakturUpper[0]) { |
||||||
|
t.Errorf("fraktur upper hash differs from ASCII\nASCII: %x\nfraktur upper: %x", ascii[0], frakturUpper[0]) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func TestTokenHashesMixedContent(t *testing.T) { |
||||||
|
// Test that mixed content normalizes correctly
|
||||||
|
content := []byte("ᴛʜᴇ quick 𝔟𝔯𝔬𝔴𝔫 fox") |
||||||
|
hashes := TokenHashes(content) |
||||||
|
|
||||||
|
// Should get: "the", "quick", "brown", "fox" (4 unique words)
|
||||||
|
if len(hashes) != 4 { |
||||||
|
t.Errorf("expected 4 hashes from mixed content, got %d", len(hashes)) |
||||||
|
} |
||||||
|
|
||||||
|
// Verify "the" matches between decorated and plain
|
||||||
|
thePlain := TokenHashes([]byte("the")) |
||||||
|
theDecorated := TokenHashes([]byte("ᴛʜᴇ")) |
||||||
|
if !bytes.Equal(thePlain[0], theDecorated[0]) { |
||||||
|
t.Errorf("'the' hash mismatch: plain=%x, decorated=%x", thePlain[0], theDecorated[0]) |
||||||
|
} |
||||||
|
|
||||||
|
// Verify "brown" matches between decorated and plain
|
||||||
|
brownPlain := TokenHashes([]byte("brown")) |
||||||
|
brownDecorated := TokenHashes([]byte("𝔟𝔯𝔬𝔴𝔫")) |
||||||
|
if !bytes.Equal(brownPlain[0], brownDecorated[0]) { |
||||||
|
t.Errorf("'brown' hash mismatch: plain=%x, decorated=%x", brownPlain[0], brownDecorated[0]) |
||||||
|
} |
||||||
|
} |
||||||
Loading…
Reference in new issue