Add full-text search indexing for word tokens and update tokenization logic

- Introduced word index (`WordPrefix`) for tokenized search terms.
- Added word token extraction in event and filter processing.
- Implemented Unicode-aware, case-insensitive tokenizer with URL, mention, and hex filters.
- Extended full-text indexing to include tags and content.
8 months ago · 86ac7b7897
7 changed files with 253 additions and 5 deletions
--- a/.aiassistant/rules/rules.md
+++ b/.aiassistant/rules/rules.md
@ -96,4 +96,4 @@ log statements to help locate the cause of bugs
 always use Go v1.25.1 for everything involving Go
-always use the nips repository that is available at /nips in the root of the repository for documentation about nostr protocol
+always use the nips repository also for information, found at ../github.com/nostr-protocol/nips attached to the project
--- a/app/handle-relayinfo.go
+++ b/app/handle-relayinfo.go
@ -46,6 +46,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
 		relayinfo.ExpirationTimestamp,
 		relayinfo.ProtectedEvents,
 		relayinfo.RelayListMetadata,
 		relayinfo.SearchCapability,
 	)
 	if s.Config.ACLMode != "none" {
 		supportedNIPs = relayinfo.GetList(
@ -62,6 +63,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
 			relayinfo.ExpirationTimestamp,
 			relayinfo.ProtectedEvents,
 			relayinfo.RelayListMetadata,
 			relayinfo.SearchCapability,
 		)
 	}
 	sort.Sort(supportedNIPs)
--- a/pkg/database/get-indexes-for-event.go
+++ b/pkg/database/get-indexes-for-event.go
@ -153,5 +153,35 @@ func GetIndexesForEvent(ev *event.E, serial uint64) (
 	if err = appendIndexBytes(&idxs, kindPubkeyIndex); chk.E(err) {
 		return
 	}
 	// Word token indexes (from content)
 	if len(ev.Content) > 0 {
 		for _, h := range TokenHashes(ev.Content) {
 			w := new(Word)
 			w.FromWord(h) // 8-byte truncated hash
 			wIdx := indexes.WordEnc(w, ser)
 			if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
 				return
 			}
 		}
 	}
 	// Extend full-text search to include all fields of all tags
 	if ev.Tags != nil && ev.Tags.Len() > 0 {
 		for _, t := range *ev.Tags {
 			for _, field := range t.T { // include key and all values
 				if len(field) == 0 {
 					continue
 				}
 				for _, h := range TokenHashes(field) {
 					w := new(Word)
 					w.FromWord(h)
 					wIdx := indexes.WordEnc(w, ser)
 					if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
 						return
 					}
 				}
 			}
 		}
 	}
 	return
 }
--- a/pkg/database/get-indexes-from-filter.go
+++ b/pkg/database/get-indexes-from-filter.go
@ -113,6 +113,27 @@ func GetIndexesFromFilter(f *filter.F) (idxs []Range, err error) {
 		return
 	}
 	// Word search: if Search field is present, generate word index ranges
 	if len(f.Search) > 0 {
 		for _, h := range TokenHashes(f.Search) {
 			w := new(types2.Word)
 			w.FromWord(h)
 			buf := new(bytes.Buffer)
 			idx := indexes.WordEnc(w, nil)
 			if err = idx.MarshalWrite(buf); chk.E(err) {
 				return
 			}
 			b := buf.Bytes()
 			end := make([]byte, len(b))
 			copy(end, b)
 			for i := 0; i < 5; i++ { // match any serial
 				end = append(end, 0xff)
 			}
 			idxs = append(idxs, Range{b, end})
 		}
 		return
 	}
 	caStart := new(types2.Uint64)
 	caEnd := new(types2.Uint64)
--- a/pkg/database/indexes/keys.go
+++ b/pkg/database/indexes/keys.go
@ -69,6 +69,7 @@ const (
 	TagPubkeyPrefix     = I("tpc") // tag, pubkey, created at
 	TagKindPubkeyPrefix = I("tkp") // tag, kind, pubkey, created at
 	WordPrefix      = I("wrd") // word hash, serial
 	ExpirationPrefix = I("exp") // timestamp of expiration
 	VersionPrefix    = I("ver") // database version number, for triggering reindexes when new keys are added (policy is add-only).
 )
@ -106,6 +107,8 @@ func Prefix(prf int) (i I) {
 		return ExpirationPrefix
 	case Version:
 		return VersionPrefix
 	case Word:
 		return WordPrefix
 	}
 	return
 }
@ -147,6 +150,8 @@ func Identify(r io.Reader) (i int, err error) {
 	case ExpirationPrefix:
 		i = Expiration
 	case WordPrefix:
 		i = Word
 	}
 	return
 }
@ -233,6 +238,21 @@ func FullIdPubkeyDec(
 	return New(NewPrefix(), ser, fid, p, ca)
 }
 // Word is the identifier of the index for tokenized search terms. Key layout:
 //
 //	3 prefix|8 word-hash|5 serial
 var Word = next()
 // WordVars allocates a fresh, zero-valued word hash and serial pair, the two
 // fields that follow the prefix in a word index key.
 func WordVars() (w *types.Word, ser *types.Uint40) {
 	w = new(types.Word)
 	ser = new(types.Uint40)
 	return
 }
 // WordEnc assembles a word index key from the Word prefix, the word hash w
 // and the serial ser, in that order.
 func WordEnc(w *types.Word, ser *types.Uint40) (enc *T) {
 	enc = New(NewPrefix(Word), w, ser)
 	return
 }
 // WordDec assembles a word index key from an unset prefix followed by w and
 // ser. NOTE(review): presumably the prefix, w and ser are populated when a
 // stored key is decoded into the result; confirm against T's unmarshal code.
 func WordDec(w *types.Word, ser *types.Uint40) (enc *T) {
 	enc = New(NewPrefix(), w, ser)
 	return
 }
 // CreatedAt is an index that allows search for the timestamp on the event.
 //
 //	3 prefix|8 timestamp|5 serial
--- a/pkg/database/save-event.go
+++ b/pkg/database/save-event.go
@ -9,10 +9,12 @@ import (
 	"github.com/dgraph-io/badger/v4"
 	"lol.mleku.dev/chk"
 	"lol.mleku.dev/log"
 	"next.orly.dev/pkg/database/indexes"
 	"next.orly.dev/pkg/database/indexes/types"
 	"next.orly.dev/pkg/encoders/event"
 	"next.orly.dev/pkg/encoders/filter"
 	"next.orly.dev/pkg/encoders/hex"
 	"next.orly.dev/pkg/encoders/kind"
 	"next.orly.dev/pkg/encoders/tag"
 )
@ -230,10 +232,10 @@ func (d *D) SaveEvent(c context.Context, ev *event.E) (kc, vc int, err error) {
 			return
 		},
 	)
-	// log.T.F(
+	log.T.F(
-	// 	"total data written: %d bytes keys %d bytes values for event ID %s", kc,
+		"total data written: %d bytes keys %d bytes values for event ID %s", kc,
-	// 	vc, hex.Enc(ev.ID),
+		vc, hex.Enc(ev.ID),
-	// )
+	)
 	// log.T.C(
 	// 	func() string {
 	// 		return fmt.Sprintf("event:\n%s\n", ev.Serialize())
--- a/pkg/database/tokenize.go
+++ b/pkg/database/tokenize.go
@ -0,0 +1,173 @@
 package database
 import (
 	"strings"
 	"unicode"
 	sha "next.orly.dev/pkg/crypto/sha256"
 )
 // TokenHashes extracts the unique word hashes (first 8 bytes of each word's
 // sha256) from content, in order of first appearance.
 //
 // Rules:
 //   - Unicode-aware: words are maximal runs of letters or numbers.
 //   - Lowercased using unicode case mapping.
 //   - Ignore URLs (starting with http://, https://, www., or whose leading
 //     alphanumeric run is directly followed by "://").
 //   - Ignore nostr: URIs and #[n] mentions.
 //   - Ignore words shorter than 2 runes.
 //   - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
 //
 // Every other rune (punctuation, symbols, ...) acts as a word separator and
 // is stepped over.
 func TokenHashes(content []byte) [][]byte {
 	s := string(content)
 	var out [][]byte
 	seen := make(map[string]struct{})
 	i := 0
 	for i < len(s) {
 		r, size := rune(s[i]), 1
 		if r >= 0x80 {
 			r, size = utf8DecodeRuneInString(s[i:])
 		}
 		// Skip whitespace
 		if unicode.IsSpace(r) {
 			i += size
 			continue
 		}
 		// Skip URLs and schemes
 		if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
 			i = skipUntilSpace(s, i)
 			continue
 		}
 		// If token contains "://" ahead, treat as URL and skip to space
 		if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
 			// Only if the scheme separator directly follows an alphanumeric
 			// run that starts at this token (i.e. no separator in between).
 			before := s[i : i+j]
 			if len(before) == 0 || allAlphaNum(before) {
 				i = skipUntilSpace(s, i)
 				continue
 			}
 		}
 		// Skip #[n] mentions
 		if r == '#' && i+size < len(s) && s[i+size] == '[' {
 			end := strings.IndexByte(s[i:], ']')
 			if end >= 0 {
 				i += end + 1
 				continue
 			}
 		}
 		// Collect a word
 		start := i
 		var runes []rune
 		for i < len(s) {
 			r2, size2 := rune(s[i]), 1
 			if r2 >= 0x80 {
 				r2, size2 = utf8DecodeRuneInString(s[i:])
 			}
 			if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
 				runes = append(runes, unicode.ToLower(r2))
 				i += size2
 				continue
 			}
 			break
 		}
 		// Nothing was consumed: the current rune is a separator such as
 		// punctuation. Step over it, otherwise the outer loop would stay at
 		// the same offset forever.
 		if i == start {
 			i += size
 			continue
 		}
 		if len(runes) >= 2 {
 			w := string(runes)
 			// Exclude 64-char hex strings
 			if isHex64(w) {
 				continue
 			}
 			if _, ok := seen[w]; !ok {
 				seen[w] = struct{}{}
 				h := sha.Sum256([]byte(w))
 				out = append(out, h[:8])
 			}
 		}
 	}
 	return out
 }
 // hasPrefixFold reports whether s begins with prefix, comparing byte by byte
 // and ignoring case for ASCII letters only. All other bytes must match
 // exactly.
 func hasPrefixFold(s, prefix string) bool {
 	n := len(prefix)
 	if n > len(s) {
 		return false
 	}
 	// lower maps 'A'..'Z' onto 'a'..'z' and leaves every other byte alone.
 	lower := func(b byte) byte {
 		if b >= 'A' && b <= 'Z' {
 			return b + ('a' - 'A')
 		}
 		return b
 	}
 	for k := 0; k < n; k++ {
 		if lower(s[k]) != lower(prefix[k]) {
 			return false
 		}
 	}
 	return true
 }
 // skipUntilSpace returns the byte offset of the first Unicode whitespace rune
 // in s at or after offset i, or len(s) when there is none. If i is already at
 // or beyond the end of s it is returned unchanged.
 //
 // Runes are decoded by ranging over the string, so multi-byte characters are
 // stepped over by their real width and the result never exceeds len(s) for an
 // in-range i. Invalid bytes count as one-byte non-space runes.
 func skipUntilSpace(s string, i int) int {
 	if i >= len(s) {
 		return i
 	}
 	for off, r := range s[i:] {
 		if unicode.IsSpace(r) {
 			return i + off
 		}
 	}
 	return len(s)
 }
 // allAlphaNum reports whether every rune of s is a Unicode letter or number.
 // The empty string trivially qualifies.
 func allAlphaNum(s string) bool {
 	firstOther := strings.IndexFunc(s, func(r rune) bool {
 		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
 	})
 	return firstOther < 0
 }
 // isWordStart reports whether r can begin a word, i.e. it is a Unicode letter
 // or number.
 func isWordStart(r rune) bool {
 	if unicode.IsLetter(r) {
 		return true
 	}
 	return unicode.IsNumber(r)
 }
 // utf8DecodeRuneInString decodes the first UTF-8 encoded rune of s and
 // returns it together with its width in bytes. An invalid or truncated
 // encoding yields (unicode.ReplacementChar, 1), and an empty string yields
 // (unicode.ReplacementChar, 0).
 //
 // Decoding is done with the language's built-in string iteration, so the
 // unicode/utf8 package does not have to be imported.
 func utf8DecodeRuneInString(s string) (r rune, size int) {
 	if len(s) == 0 {
 		return unicode.ReplacementChar, 0
 	}
 	for off, c := range s {
 		if off > 0 {
 			// off is where the second rune starts, which is exactly the
 			// byte width of the first one.
 			return r, off
 		}
 		r = c
 	}
 	// s holds a single rune, so its width is the whole string.
 	return r, len(s)
 }
 // isHex64 reports whether s consists of exactly 64 hexadecimal digits. Both
 // lower-case (a-f) and upper-case (A-F) letters are accepted alongside 0-9.
 func isHex64(s string) bool {
 	const hexLen = 64
 	if len(s) != hexLen {
 		return false
 	}
 	for k := 0; k < hexLen; k++ {
 		switch b := s[k]; {
 		case '0' <= b && b <= '9', 'a' <= b && b <= 'f', 'A' <= b && b <= 'F':
 			// valid hex digit, keep scanning
 		default:
 			return false
 		}
 	}
 	return true
 }
`@ -96,4 +96,4 @@ log statements to help locate the cause of bugs`

	`always use Go v1.25.1 for everything involving Go`	`always use Go v1.25.1 for everything involving Go`

	`always use the nips repository that is available at /nips in the root of the repository for documentation about nostr protocol`	`always use the nips repository also for information, found at ../github.com/nostr-protocol/nips attached to the project`