From 3fb3526194e54b6db816a1d12f2793a5de69503e Mon Sep 17 00:00:00 2001 From: Silberengel Date: Mon, 13 Oct 2025 11:29:53 +0200 Subject: [PATCH] normalizing hashtags --- src/constants.ts | 2 +- src/lib/discussion-topics.ts | 54 ++++++++--- src/lib/draft-event.ts | 15 ++- src/pages/primary/DiscussionsPage/index.tsx | 102 ++------------------ 4 files changed, 60 insertions(+), 113 deletions(-) diff --git a/src/constants.ts b/src/constants.ts index 19882b3..d665b53 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -160,7 +160,7 @@ export const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/ export const EMOJI_SHORT_CODE_REGEX = /:[a-zA-Z0-9_-]+:/g export const EMBEDDED_EVENT_REGEX = /nostr:(note1[a-z0-9]{58}|nevent1[a-z0-9]+|naddr1[a-z0-9]+)/g export const EMBEDDED_MENTION_REGEX = /nostr:(npub1[a-z0-9]{58}|nprofile1[a-z0-9]+)/g -export const HASHTAG_REGEX = /#[a-zA-Z0-9_\u00C0-\u017F\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]+/g +export const HASHTAG_REGEX = /#[a-zA-Z0-9_\-\u00C0-\u017F\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]+/g export const LN_INVOICE_REGEX = /(ln(?:bc|tb|bcrt))([0-9]+[munp]?)?1([02-9ac-hj-np-z]+)/g export const EMOJI_REGEX = /[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA70}-\u{1FAFF}]|[\u{1F004}]|[\u{1F0CF}]|[\u{1F18E}]|[\u{3030}]|[\u{2B50}]|[\u{2B55}]|[\u{2934}-\u{2935}]|[\u{2B05}-\u{2B07}]|[\u{2B1B}-\u{2B1C}]|[\u{3297}]|[\u{3299}]|[\u{303D}]|[\u{00A9}]|[\u{00AE}]|[\u{2122}]|[\u{23E9}-\u{23EF}]|[\u{23F0}]|[\u{23F3}]|[\u{FE00}-\u{FE0F}]|[\u{200D}]/gu diff --git a/src/lib/discussion-topics.ts b/src/lib/discussion-topics.ts index f2ff943..d8b8d45 100644 --- a/src/lib/discussion-topics.ts +++ b/src/lib/discussion-topics.ts @@ -2,39 +2,53 @@ import { HASHTAG_REGEX } from '@/constants' import { NostrEvent } from 'nostr-tools' /** - * Normalize a topic string to lowercase with hyphens, no spaces - * Also converts plurals to singular form + * Normalize a hashtag/topic string + * @param text The text to normalize + * @param replaceSpaces Whether to replace spaces with hyphens (true for t-tags, false for content hashtags) + * @returns Normalized string (lowercase, filtered characters, singular form) */ -export function normalizeTopic(topic: string): string { - let normalized = topic - .toLowerCase() - .replace(/\s+/g, '-') - .replace(/[^a-z0-9-]/g, '') - .replace(/-+/g, '-') - .replace(/^-|-$/g, '') +export function normalizeHashtag(text: string, replaceSpaces: boolean = true): string { + // Convert to lowercase and optionally replace spaces with hyphens + let normalized = text.toLowerCase() + if (replaceSpaces) { + normalized = normalized.replace(/\s+/g, '-') + } + + // Only allow letters, numbers, hyphens, and underscores + normalized = normalized.replace(/[^a-z0-9_-]/g, '') + + // Clean up multiple consecutive hyphens/underscores + normalized = normalized.replace(/[-_]+/g, '-') + + // Remove leading/trailing hyphens/underscores + normalized = normalized.replace(/^[-_]+|[-_]+$/g, '') + + // Reject hashtags that are only numbers + if (/^[0-9]+$/.test(normalized)) { + return '' + } + + // Reject empty strings + if (!normalized) { + return '' + } // Convert plural to singular (simple English plurals) // Handle common cases: -ies -> -y, -es -> (sometimes), -s -> remove if (normalized.endsWith('ies') && normalized.length > 4) { // cities -> city, berries -> berry normalized = normalized.slice(0, -3) + 'y' - } else if (normalized.endsWith('ves') && normalized.length > 4) { - // wives -> wife, knives -> knife - normalized = normalized.slice(0, -3) + 'fe' } else if (normalized.endsWith('ses') && normalized.length > 4) { // classes -> class, bosses -> boss normalized = normalized.slice(0, -2) } else if (normalized.endsWith('xes') && normalized.length > 4) { // boxes -> box normalized = normalized.slice(0, -2) - } else if (normalized.endsWith('shes') && normalized.length > 5) { - // dishes -> dish - normalized = normalized.slice(0, -2) } else if (normalized.endsWith('ches') && normalized.length > 5) { // churches -> church normalized = normalized.slice(0, -2) } else if (normalized.endsWith('s') && normalized.length > 2) { - // Simple plural: cats -> cat, bitcoins -> bitcoin + // Simple plural: cats -> cat, bitcoins -> bitcoin, Christians -> Christian // But avoid removing 's' from words that naturally end in 's' // Check if second-to-last character is not 's' to avoid "ss" words const secondLast = normalized[normalized.length - 2] @@ -46,6 +60,14 @@ export function normalizeTopic(topic: string): string { return normalized } +/** + * Normalize a topic string (t-tags) - replaces spaces with hyphens + * Alias for normalizeHashtag with replaceSpaces=true + */ +export function normalizeTopic(topic: string): string { + return normalizeHashtag(topic, true) +} + /** * Extract hashtags from content */ diff --git a/src/lib/draft-event.ts b/src/lib/draft-event.ts index 339c922..1649f8b 100644 --- a/src/lib/draft-event.ts +++ b/src/lib/draft-event.ts @@ -3,6 +3,7 @@ import client from '@/services/client.service' import customEmojiService from '@/services/custom-emoji.service' import mediaUpload from '@/services/media-upload.service' import { prefixNostrAddresses } from '@/lib/nostr-address' +import { normalizeHashtag } from '@/lib/discussion-topics' import { TDraftEvent, TEmoji, @@ -742,11 +743,17 @@ async function extractCommentMentions(content: string, parentEvent: Event) { function extractHashtags(content: string) { const hashtags: string[] = [] - const matches = content.match(/#[\p{L}\p{N}\p{M}]+/gu) + // Match hashtags including hyphens, underscores, and unicode characters + // But stop at whitespace or common punctuation + const matches = content.match(/#[\p{L}\p{N}\p{M}_-]+/gu) matches?.forEach((m) => { - const hashtag = m.slice(1).toLowerCase() - if (hashtag) { - hashtags.push(hashtag) + const hashtag = m.slice(1) + // Use shared normalization function (without space replacement for content hashtags) + const normalized = normalizeHashtag(hashtag, false) + + // Only add if not empty (normalizeHashtag already filters out pure numbers) + if (normalized) { + hashtags.push(normalized) } }) return hashtags diff --git a/src/pages/primary/DiscussionsPage/index.tsx b/src/pages/primary/DiscussionsPage/index.tsx index de97efe..e6d8a62 100644 --- a/src/pages/primary/DiscussionsPage/index.tsx +++ b/src/pages/primary/DiscussionsPage/index.tsx @@ -1,8 +1,9 @@ import { Button } from '@/components/ui/button' import { Card, CardContent } from '@/components/ui/card' // Removed dropdown menu import - no longer using relay selection -import { FAST_READ_RELAY_URLS } from '@/constants' +import { FAST_READ_RELAY_URLS, HASHTAG_REGEX } from '@/constants' import { normalizeUrl } from '@/lib/url' +import { normalizeTopic } from '@/lib/discussion-topics' import { useFavoriteRelays } from '@/providers/FavoriteRelaysProvider' import { useNostr } from '@/providers/NostrProvider' import { forwardRef, useEffect, useState, useCallback, useRef } from 'react' @@ -22,89 +23,6 @@ import { useSecondaryPage } from '@/PageManager' import { toNote } from '@/lib/link' import { kinds } from 'nostr-tools' -// Normalize subtopic hashtags using linguistic rules to group similar variations -function normalizeSubtopic(tag: string): string { - let normalized = tag.toLowerCase().trim() - - // Don't normalize very short words (2 chars or less) - if (normalized.length <= 2) { - return normalized - } - - // Don't normalize compound hashtags (with hyphens or underscores) - if (normalized.includes('-') || normalized.includes('_')) { - return normalized - } - - // Handle common suffixes to find root forms - - // Remove trailing 's' for plurals (but not if word ends in 'ss') - if (normalized.endsWith('s') && !normalized.endsWith('ss')) { - // Special cases for words ending in 'ies' -> 'y' (e.g., stories -> story) - if (normalized.endsWith('ies') && normalized.length > 4) { - return normalized.slice(0, -3) + 'y' - } - // Special cases for words ending in 'es' (e.g., churches -> church, but not always) - if (normalized.endsWith('ches') || normalized.endsWith('shes') || normalized.endsWith('xes') || - normalized.endsWith('zes') || normalized.endsWith('ses')) { - return normalized.slice(0, -2) - } - // Regular plural: just remove 's' - return normalized.slice(0, -1) - } - - // Handle -ing forms (e.g., reading -> read, cooking -> cook) - if (normalized.endsWith('ing') && normalized.length > 5) { - const root = normalized.slice(0, -3) - // Handle doubled consonants (e.g., running -> run, shopping -> shop) - if (root.length >= 2 && root[root.length - 1] === root[root.length - 2]) { - return root.slice(0, -1) - } - return root - } - - // Handle -ed forms (e.g., deleted -> delete) - if (normalized.endsWith('ed') && normalized.length > 4) { - const root = normalized.slice(0, -2) - // Handle doubled consonants - if (root.length >= 2 && root[root.length - 1] === root[root.length - 2]) { - return root.slice(0, -1) - } - return root - } - - // Handle -er forms (e.g., developer -> develop, but not 'user' -> 'us') - if (normalized.endsWith('er') && normalized.length > 4 && !normalized.endsWith('eer')) { - return normalized.slice(0, -2) - } - - // Handle -ly adverbs (e.g., quickly -> quick) - if (normalized.endsWith('ly') && normalized.length > 4) { - return normalized.slice(0, -2) - } - - // Handle -y to -ies (e.g., philosophy/philosophical, economy/economics) - // Already handled by the 'ies' -> 'y' rule above - - // Handle -ism, -ist, -ian variations (e.g., Buddhism/Buddhist, Christian/Christianity) - if (normalized.endsWith('ism') && normalized.length > 5) { - return normalized.slice(0, -3) - } - if (normalized.endsWith('ist') && normalized.length > 5) { - return normalized.slice(0, -3) - } - if (normalized.endsWith('ity') && normalized.length > 5) { - return normalized.slice(0, -3) - } - if (normalized.endsWith('ian') && normalized.length > 5) { - return normalized.slice(0, -3) - } - if (normalized.endsWith('ians') && normalized.length > 6) { - return normalized.slice(0, -4) - } - - return normalized -} // Function to determine topic based on actual t-tags and hashtags function getTopicFromTags(allTopics: string[], availableTopicIds: string[]): string { @@ -364,8 +282,8 @@ const DiscussionsPage = forwardRef((_, ref) => { // Extract topics - normalize subtopics but keep originals for topic detection const tTagsRaw = event.tags.filter(tag => tag[0] === 't' && tag[1]).map(tag => tag[1].toLowerCase()) - // Match hashtags with letters, numbers, hyphens, and underscores - const hashtagsRaw = (event.content.match(/#[\w-]+/g) || []).map(tag => tag.slice(1).toLowerCase()) + // Match hashtags using the same regex as everywhere else + const hashtagsRaw = (event.content.match(HASHTAG_REGEX) || []).map(tag => tag.slice(1).toLowerCase()) const allTopicsRaw = [...new Set([...tTagsRaw, ...hashtagsRaw])] // Determine the main topic from raw tags (use only predefined topics during fetch) @@ -373,8 +291,8 @@ const DiscussionsPage = forwardRef((_, ref) => { const categorizedTopic = getTopicFromTags(allTopicsRaw, predefinedTopicIds) // Normalize subtopics for grouping (but not main topic IDs) - const tTags = tTagsRaw.map(tag => normalizeSubtopic(tag)) - const hashtags = hashtagsRaw.map(tag => normalizeSubtopic(tag)) + const tTags = tTagsRaw.map(tag => normalizeTopic(tag)) + const hashtags = hashtagsRaw.map(tag => normalizeTopic(tag)) const allTopics = [...new Set([...tTags, ...hashtags])] finalEventMap.set(eventId, { @@ -574,16 +492,16 @@ const DiscussionsPage = forwardRef((_, ref) => { // Extract topics from the published event const tTagsRaw = publishedEvent.tags.filter(tag => tag[0] === 't' && tag[1]).map(tag => tag[1].toLowerCase()) - const hashtagsRaw = (publishedEvent.content.match(/#[\w-]+/g) || []).map(tag => tag.slice(1).toLowerCase()) + const hashtagsRaw = (publishedEvent.content.match(HASHTAG_REGEX) || []).map(tag => tag.slice(1).toLowerCase()) const allTopicsRaw = [...new Set([...tTagsRaw, ...hashtagsRaw])] // Determine the main topic from raw tags const predefinedTopicIds = DISCUSSION_TOPICS.map(t => t.id) const categorizedTopic = getTopicFromTags(allTopicsRaw, predefinedTopicIds) - // Normalize subtopics for grouping - const tTags = tTagsRaw.map(tag => normalizeSubtopic(tag)) - const hashtags = hashtagsRaw.map(tag => normalizeSubtopic(tag)) + // Normalize subtopics for grouping using the same function as ThreadCard + const tTags = tTagsRaw.map(tag => normalizeTopic(tag)) + const hashtags = hashtagsRaw.map(tag => normalizeTopic(tag)) const allTopics = [...new Set([...tTags, ...hashtags])] // Get relay sources from event hints (tracked during publishing)