You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
135 lines
5.3 KiB
135 lines
5.3 KiB
//go:build js && wasm |
|
|
|
package database |
|
|
|
// normalizeRune maps decorative unicode characters (small caps, fraktur) back to |
|
// their ASCII equivalents for consistent word indexing. This ensures that text |
|
// written with decorative alphabets (e.g., "ᴅᴇᴀᴛʜ" or "𝔇𝔢𝔞𝔱𝔥") indexes the same |
|
// as regular ASCII ("death"). |
|
// |
|
// Character sets normalized: |
|
// - Small Caps (used for DEATH-style text in Terry Pratchett tradition) |
|
// - Mathematical Fraktur lowercase (𝔞-𝔷) |
|
// - Mathematical Fraktur uppercase (𝔄-ℨ, including Letterlike Symbols block exceptions) |
|
func normalizeRune(r rune) rune { |
|
// Check small caps first (scattered codepoints) |
|
if mapped, ok := smallCapsToASCII[r]; ok { |
|
return mapped |
|
} |
|
|
|
// Check fraktur lowercase: U+1D51E to U+1D537 (contiguous range) |
|
if r >= 0x1D51E && r <= 0x1D537 { |
|
return 'a' + (r - 0x1D51E) |
|
} |
|
|
|
// Check fraktur uppercase main range: U+1D504 to U+1D51C (with gaps) |
|
if r >= 0x1D504 && r <= 0x1D51C { |
|
if mapped, ok := frakturUpperToASCII[r]; ok { |
|
return mapped |
|
} |
|
} |
|
|
|
// Check fraktur uppercase exceptions from Letterlike Symbols block |
|
if mapped, ok := frakturLetterlikeToASCII[r]; ok { |
|
return mapped |
|
} |
|
|
|
return r |
|
} |
|
|
|
// smallCapsToASCII maps small capital letters to lowercase ASCII.
// The letters are scattered across multiple Unicode blocks (IPA Extensions,
// Phonetic Extensions, Latin Extended-D), so a lookup table is used instead
// of range arithmetic.
//
// Q has two entries: the dedicated small capital Q (U+A7AF, added in
// Unicode 11.0) and 'ǫ' (U+01EB), the stand-in this table has always
// accepted for Q. There is no entry for X.
var smallCapsToASCII = map[rune]rune{
	'ᴀ': 'a', // U+1D00 LATIN LETTER SMALL CAPITAL A
	'ʙ': 'b', // U+0299 LATIN LETTER SMALL CAPITAL B
	'ᴄ': 'c', // U+1D04 LATIN LETTER SMALL CAPITAL C
	'ᴅ': 'd', // U+1D05 LATIN LETTER SMALL CAPITAL D
	'ᴇ': 'e', // U+1D07 LATIN LETTER SMALL CAPITAL E
	'ꜰ': 'f', // U+A730 LATIN LETTER SMALL CAPITAL F
	'ɢ': 'g', // U+0262 LATIN LETTER SMALL CAPITAL G
	'ʜ': 'h', // U+029C LATIN LETTER SMALL CAPITAL H
	'ɪ': 'i', // U+026A LATIN LETTER SMALL CAPITAL I
	'ᴊ': 'j', // U+1D0A LATIN LETTER SMALL CAPITAL J
	'ᴋ': 'k', // U+1D0B LATIN LETTER SMALL CAPITAL K
	'ʟ': 'l', // U+029F LATIN LETTER SMALL CAPITAL L
	'ᴍ': 'm', // U+1D0D LATIN LETTER SMALL CAPITAL M
	'ɴ': 'n', // U+0274 LATIN LETTER SMALL CAPITAL N
	'ᴏ': 'o', // U+1D0F LATIN LETTER SMALL CAPITAL O
	'ᴘ': 'p', // U+1D18 LATIN LETTER SMALL CAPITAL P
	'ꞯ': 'q', // U+A7AF LATIN LETTER SMALL CAPITAL Q
	'ǫ': 'q', // U+01EB LATIN SMALL LETTER O WITH OGONEK (stand-in for small cap Q, kept for compatibility)
	'ʀ': 'r', // U+0280 LATIN LETTER SMALL CAPITAL R
	'ꜱ': 's', // U+A731 LATIN LETTER SMALL CAPITAL S
	'ᴛ': 't', // U+1D1B LATIN LETTER SMALL CAPITAL T
	'ᴜ': 'u', // U+1D1C LATIN LETTER SMALL CAPITAL U
	'ᴠ': 'v', // U+1D20 LATIN LETTER SMALL CAPITAL V
	'ᴡ': 'w', // U+1D21 LATIN LETTER SMALL CAPITAL W
	// Note: no small cap X exists in standard use
	'ʏ': 'y', // U+028F LATIN LETTER SMALL CAPITAL Y
	'ᴢ': 'z', // U+1D22 LATIN LETTER SMALL CAPITAL Z
}
|
|
|
// frakturUpperToASCII translates the Mathematical Fraktur capitals encoded in
// the U+1D504-U+1D51C run to lowercase ASCII letters. Keys are written as
// code points; the glyph is shown in the trailing comment.
//
// C, H, I, R and Z are deliberately missing: their Fraktur capitals are
// encoded in the Letterlike Symbols block instead, which leaves holes in
// this run.
var frakturUpperToASCII = map[rune]rune{
	0x1D504: 'a', // 𝔄 MATHEMATICAL FRAKTUR CAPITAL A
	0x1D505: 'b', // 𝔅 MATHEMATICAL FRAKTUR CAPITAL B
	// hole: C is U+212D in Letterlike Symbols
	0x1D507: 'd', // 𝔇 MATHEMATICAL FRAKTUR CAPITAL D
	0x1D508: 'e', // 𝔈 MATHEMATICAL FRAKTUR CAPITAL E
	0x1D509: 'f', // 𝔉 MATHEMATICAL FRAKTUR CAPITAL F
	0x1D50A: 'g', // 𝔊 MATHEMATICAL FRAKTUR CAPITAL G
	// hole: H is U+210C in Letterlike Symbols
	// hole: I is U+2111 in Letterlike Symbols
	0x1D50D: 'j', // 𝔍 MATHEMATICAL FRAKTUR CAPITAL J
	0x1D50E: 'k', // 𝔎 MATHEMATICAL FRAKTUR CAPITAL K
	0x1D50F: 'l', // 𝔏 MATHEMATICAL FRAKTUR CAPITAL L
	0x1D510: 'm', // 𝔐 MATHEMATICAL FRAKTUR CAPITAL M
	0x1D511: 'n', // 𝔑 MATHEMATICAL FRAKTUR CAPITAL N
	0x1D512: 'o', // 𝔒 MATHEMATICAL FRAKTUR CAPITAL O
	0x1D513: 'p', // 𝔓 MATHEMATICAL FRAKTUR CAPITAL P
	0x1D514: 'q', // 𝔔 MATHEMATICAL FRAKTUR CAPITAL Q
	// hole: R is U+211C in Letterlike Symbols
	0x1D516: 's', // 𝔖 MATHEMATICAL FRAKTUR CAPITAL S
	0x1D517: 't', // 𝔗 MATHEMATICAL FRAKTUR CAPITAL T
	0x1D518: 'u', // 𝔘 MATHEMATICAL FRAKTUR CAPITAL U
	0x1D519: 'v', // 𝔙 MATHEMATICAL FRAKTUR CAPITAL V
	0x1D51A: 'w', // 𝔚 MATHEMATICAL FRAKTUR CAPITAL W
	0x1D51B: 'x', // 𝔛 MATHEMATICAL FRAKTUR CAPITAL X
	0x1D51C: 'y', // 𝔜 MATHEMATICAL FRAKTUR CAPITAL Y
	// Z is U+2128 in Letterlike Symbols
}
|
|
|
// frakturLetterlikeToASCII covers the Fraktur (black-letter) capitals that
// are encoded in the Letterlike Symbols block (U+2100-U+214F) rather than in
// Mathematical Alphanumeric Symbols, mapping each to its lowercase ASCII
// letter. Entries are listed in code point order.
var frakturLetterlikeToASCII = map[rune]rune{
	0x210C: 'h', // ℌ BLACK-LETTER CAPITAL H
	0x2111: 'i', // ℑ BLACK-LETTER CAPITAL I
	0x211C: 'r', // ℜ BLACK-LETTER CAPITAL R
	0x2128: 'z', // ℨ BLACK-LETTER CAPITAL Z
	0x212D: 'c', // ℭ BLACK-LETTER CAPITAL C
}
|
|
|
// hasDecorativeUnicode checks if text contains any small caps or fraktur characters |
|
// that would need normalization. Used by migration to identify events needing re-indexing. |
|
func hasDecorativeUnicode(s string) bool { |
|
for _, r := range s { |
|
// Check small caps |
|
if _, ok := smallCapsToASCII[r]; ok { |
|
return true |
|
} |
|
// Check fraktur lowercase range |
|
if r >= 0x1D51E && r <= 0x1D537 { |
|
return true |
|
} |
|
// Check fraktur uppercase range |
|
if r >= 0x1D504 && r <= 0x1D51C { |
|
return true |
|
} |
|
// Check letterlike symbols fraktur |
|
if _, ok := frakturLetterlikeToASCII[r]; ok { |
|
return true |
|
} |
|
} |
|
return false |
|
}
|
|
|