normalizing hashtags

8 months ago · 3fb3526194
4 changed files with 60 additions and 113 deletions
--- a/src/constants.ts
+++ b/src/constants.ts
@@ -160,7 +160,7 @@ export const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/
 export const EMOJI_SHORT_CODE_REGEX = /:[a-zA-Z0-9_-]+:/g
 export const EMBEDDED_EVENT_REGEX = /nostr:(note1[a-z0-9]{58}|nevent1[a-z0-9]+|naddr1[a-z0-9]+)/g
 export const EMBEDDED_MENTION_REGEX = /nostr:(npub1[a-z0-9]{58}|nprofile1[a-z0-9]+)/g
-export const HASHTAG_REGEX = /#[a-zA-Z0-9_\u00C0-\u017F\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]+/g
+export const HASHTAG_REGEX = /#[a-zA-Z0-9_\-\u00C0-\u017F\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]+/g
 export const LN_INVOICE_REGEX = /(ln(?:bc|tb|bcrt))([0-9]+[munp]?)?1([02-9ac-hj-np-z]+)/g
 export const EMOJI_REGEX =
  /[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA70}-\u{1FAFF}]|[\u{1F004}]|[\u{1F0CF}]|[\u{1F18E}]|[\u{3030}]|[\u{2B50}]|[\u{2B55}]|[\u{2934}-\u{2935}]|[\u{2B05}-\u{2B07}]|[\u{2B1B}-\u{2B1C}]|[\u{3297}]|[\u{3299}]|[\u{303D}]|[\u{00A9}]|[\u{00AE}]|[\u{2122}]|[\u{23E9}-\u{23EF}]|[\u{23F0}]|[\u{23F3}]|[\u{FE00}-\u{FE0F}]|[\u{200D}]/gu
--- a/src/lib/discussion-topics.ts
+++ b/src/lib/discussion-topics.ts
@@ -2,39 +2,53 @@ import { HASHTAG_REGEX } from '@/constants'
 import { NostrEvent } from 'nostr-tools'

 /**
- * Normalize a topic string to lowercase with hyphens, no spaces
- * Also converts plurals to singular form
+ * Normalize a hashtag/topic string
+ * @param text The text to normalize
+ * @param replaceSpaces Whether to replace spaces with hyphens (true for t-tags, false for content hashtags)
+ * @returns Normalized string (lowercase, filtered characters, singular form)
 */
-export function normalizeTopic(topic: string): string {
-  let normalized = topic
-    .toLowerCase()
-    .replace(/\s+/g, '-')
-    .replace(/[^a-z0-9-]/g, '')
-    .replace(/-+/g, '-')
-    .replace(/^-|-$/g, '')
+export function normalizeHashtag(text: string, replaceSpaces: boolean = true): string {
+  // Convert to lowercase and optionally replace spaces with hyphens
+  let normalized = text.toLowerCase()
+  if (replaceSpaces) {
+    normalized = normalized.replace(/\s+/g, '-')
+  }
+  
+  // Only allow letters, numbers, hyphens, and underscores
+  normalized = normalized.replace(/[^a-z0-9_-]/g, '')
+  
+  // Clean up multiple consecutive hyphens/underscores
+  normalized = normalized.replace(/[-_]+/g, '-')
+  
+  // Remove leading/trailing hyphens/underscores
+  normalized = normalized.replace(/^[-_]+|[-_]+$/g, '')
+  
+  // Reject hashtags that are only numbers
+  if (/^[0-9]+$/.test(normalized)) {
+    return ''
+  }
+  
+  // Reject empty strings
+  if (!normalized) {
+    return ''
+  }
  
  // Convert plural to singular (simple English plurals)
  // Handle common cases: -ies -> -y, -es -> (sometimes), -s -> remove
  if (normalized.endsWith('ies') && normalized.length > 4) {
    // cities -> city, berries -> berry
    normalized = normalized.slice(0, -3) + 'y'
-  } else if (normalized.endsWith('ves') && normalized.length > 4) {
-    // wives -> wife, knives -> knife
-    normalized = normalized.slice(0, -3) + 'fe'
  } else if (normalized.endsWith('ses') && normalized.length > 4) {
    // classes -> class, bosses -> boss
    normalized = normalized.slice(0, -2)
  } else if (normalized.endsWith('xes') && normalized.length > 4) {
    // boxes -> box
    normalized = normalized.slice(0, -2)
-  } else if (normalized.endsWith('shes') && normalized.length > 5) {
-    // dishes -> dish
-    normalized = normalized.slice(0, -2)
  } else if (normalized.endsWith('ches') && normalized.length > 5) {
    // churches -> church
    normalized = normalized.slice(0, -2)
  } else if (normalized.endsWith('s') && normalized.length > 2) {
-    // Simple plural: cats -> cat, bitcoins -> bitcoin
+    // Simple plural: cats -> cat, bitcoins -> bitcoin, Christians -> Christian
    // But avoid removing 's' from words that naturally end in 's'
    // Check if second-to-last character is not 's' to avoid "ss" words
    const secondLast = normalized[normalized.length - 2]
@@ -46,6 +60,14 @@ export function normalizeTopic(topic: string): string {
  return normalized
 }

+/**
+ * Normalize a topic string (t-tags) - replaces spaces with hyphens
+ * Alias for normalizeHashtag with replaceSpaces=true
+ */
+export function normalizeTopic(topic: string): string {
+  return normalizeHashtag(topic, true)
+}
+
 /**
 * Extract hashtags from content
 */
--- a/src/lib/draft-event.ts
+++ b/src/lib/draft-event.ts
@@ -3,6 +3,7 @@ import client from '@/services/client.service'
 import customEmojiService from '@/services/custom-emoji.service'
 import mediaUpload from '@/services/media-upload.service'
 import { prefixNostrAddresses } from '@/lib/nostr-address'
+import { normalizeHashtag } from '@/lib/discussion-topics'
 import {
  TDraftEvent,
  TEmoji,
@@ -742,11 +743,17 @@ async function extractCommentMentions(content: string, parentEvent: Event) {

 function extractHashtags(content: string) {
  const hashtags: string[] = []
-  const matches = content.match(/#[\p{L}\p{N}\p{M}]+/gu)
+  // Match hashtags including hyphens, underscores, and unicode characters
+  // But stop at whitespace or common punctuation
+  const matches = content.match(/#[\p{L}\p{N}\p{M}_-]+/gu)
  matches?.forEach((m) => {
-    const hashtag = m.slice(1).toLowerCase()
-    if (hashtag) {
-      hashtags.push(hashtag)
+    const hashtag = m.slice(1)
+    // Use shared normalization function (without space replacement for content hashtags)
+    const normalized = normalizeHashtag(hashtag, false)
+    
+    // Only add if not empty (normalizeHashtag already filters out pure numbers)
+    if (normalized) {
+      hashtags.push(normalized)
    }
  })
  return hashtags
--- a/src/pages/primary/DiscussionsPage/index.tsx
+++ b/src/pages/primary/DiscussionsPage/index.tsx
@@ -1,8 +1,9 @@
 import { Button } from '@/components/ui/button'
 import { Card, CardContent } from '@/components/ui/card'
 // Removed dropdown menu import - no longer using relay selection
-import { FAST_READ_RELAY_URLS } from '@/constants'
+import { FAST_READ_RELAY_URLS, HASHTAG_REGEX } from '@/constants'
 import { normalizeUrl } from '@/lib/url'
+import { normalizeTopic } from '@/lib/discussion-topics'
 import { useFavoriteRelays } from '@/providers/FavoriteRelaysProvider'
 import { useNostr } from '@/providers/NostrProvider'
 import { forwardRef, useEffect, useState, useCallback, useRef } from 'react'
@@ -22,89 +23,6 @@ import { useSecondaryPage } from '@/PageManager'
 import { toNote } from '@/lib/link'
 import { kinds } from 'nostr-tools'

-// Normalize subtopic hashtags using linguistic rules to group similar variations
-function normalizeSubtopic(tag: string): string {
-  let normalized = tag.toLowerCase().trim()
-  
-  // Don't normalize very short words (2 chars or less)
-  if (normalized.length <= 2) {
-    return normalized
-  }
-  
-  // Don't normalize compound hashtags (with hyphens or underscores)
-  if (normalized.includes('-') || normalized.includes('_')) {
-    return normalized
-  }
-  
-  // Handle common suffixes to find root forms
-  
-  // Remove trailing 's' for plurals (but not if word ends in 'ss')
-  if (normalized.endsWith('s') && !normalized.endsWith('ss')) {
-    // Special cases for words ending in 'ies' -> 'y' (e.g., stories -> story)
-    if (normalized.endsWith('ies') && normalized.length > 4) {
-      return normalized.slice(0, -3) + 'y'
-    }
-    // Special cases for words ending in 'es' (e.g., churches -> church, but not always)
-    if (normalized.endsWith('ches') || normalized.endsWith('shes') || normalized.endsWith('xes') || 
-        normalized.endsWith('zes') || normalized.endsWith('ses')) {
-      return normalized.slice(0, -2)
-    }
-    // Regular plural: just remove 's'
-    return normalized.slice(0, -1)
-  }
-  
-  // Handle -ing forms (e.g., reading -> read, cooking -> cook)
-  if (normalized.endsWith('ing') && normalized.length > 5) {
-    const root = normalized.slice(0, -3)
-    // Handle doubled consonants (e.g., running -> run, shopping -> shop)
-    if (root.length >= 2 && root[root.length - 1] === root[root.length - 2]) {
-      return root.slice(0, -1)
-    }
-    return root
-  }
-  
-  // Handle -ed forms (e.g., deleted -> delete)
-  if (normalized.endsWith('ed') && normalized.length > 4) {
-    const root = normalized.slice(0, -2)
-    // Handle doubled consonants
-    if (root.length >= 2 && root[root.length - 1] === root[root.length - 2]) {
-      return root.slice(0, -1)
-    }
-    return root
-  }
-  
-  // Handle -er forms (e.g., developer -> develop, but not 'user' -> 'us')
-  if (normalized.endsWith('er') && normalized.length > 4 && !normalized.endsWith('eer')) {
-    return normalized.slice(0, -2)
-  }
-  
-  // Handle -ly adverbs (e.g., quickly -> quick)
-  if (normalized.endsWith('ly') && normalized.length > 4) {
-    return normalized.slice(0, -2)
-  }
-  
-  // Handle -y to -ies (e.g., philosophy/philosophical, economy/economics)
-  // Already handled by the 'ies' -> 'y' rule above
-  
-  // Handle -ism, -ist, -ian variations (e.g., Buddhism/Buddhist, Christian/Christianity)
-  if (normalized.endsWith('ism') && normalized.length > 5) {
-    return normalized.slice(0, -3)
-  }
-  if (normalized.endsWith('ist') && normalized.length > 5) {
-    return normalized.slice(0, -3)
-  }
-  if (normalized.endsWith('ity') && normalized.length > 5) {
-    return normalized.slice(0, -3)
-  }
-  if (normalized.endsWith('ian') && normalized.length > 5) {
-    return normalized.slice(0, -3)
-  }
-  if (normalized.endsWith('ians') && normalized.length > 6) {
-    return normalized.slice(0, -4)
-  }
-  
-  return normalized
-}

 // Function to determine topic based on actual t-tags and hashtags
 function getTopicFromTags(allTopics: string[], availableTopicIds: string[]): string {
@@ -364,8 +282,8 @@ const DiscussionsPage = forwardRef((_, ref) => {
        
        // Extract topics - normalize subtopics but keep originals for topic detection
        const tTagsRaw = event.tags.filter(tag => tag[0] === 't' && tag[1]).map(tag => tag[1].toLowerCase())
-        // Match hashtags with letters, numbers, hyphens, and underscores
-        const hashtagsRaw = (event.content.match(/#[\w-]+/g) || []).map(tag => tag.slice(1).toLowerCase())
+        // Match hashtags using the same regex as everywhere else
+        const hashtagsRaw = (event.content.match(HASHTAG_REGEX) || []).map(tag => tag.slice(1).toLowerCase())
        const allTopicsRaw = [...new Set([...tTagsRaw, ...hashtagsRaw])]
        
        // Determine the main topic from raw tags (use only predefined topics during fetch)
@@ -373,8 +291,8 @@ const DiscussionsPage = forwardRef((_, ref) => {
        const categorizedTopic = getTopicFromTags(allTopicsRaw, predefinedTopicIds)
        
        // Normalize subtopics for grouping (but not main topic IDs)
-        const tTags = tTagsRaw.map(tag => normalizeSubtopic(tag))
-        const hashtags = hashtagsRaw.map(tag => normalizeSubtopic(tag))
+        const tTags = tTagsRaw.map(tag => normalizeTopic(tag))
+        const hashtags = hashtagsRaw.map(tag => normalizeTopic(tag))
        const allTopics = [...new Set([...tTags, ...hashtags])]
        
        finalEventMap.set(eventId, {
@@ -574,16 +492,16 @@ const DiscussionsPage = forwardRef((_, ref) => {
      
      // Extract topics from the published event
      const tTagsRaw = publishedEvent.tags.filter(tag => tag[0] === 't' && tag[1]).map(tag => tag[1].toLowerCase())
-      const hashtagsRaw = (publishedEvent.content.match(/#[\w-]+/g) || []).map(tag => tag.slice(1).toLowerCase())
+      const hashtagsRaw = (publishedEvent.content.match(HASHTAG_REGEX) || []).map(tag => tag.slice(1).toLowerCase())
      const allTopicsRaw = [...new Set([...tTagsRaw, ...hashtagsRaw])]
      
      // Determine the main topic from raw tags
      const predefinedTopicIds = DISCUSSION_TOPICS.map(t => t.id)
      const categorizedTopic = getTopicFromTags(allTopicsRaw, predefinedTopicIds)
      
-      // Normalize subtopics for grouping
-      const tTags = tTagsRaw.map(tag => normalizeSubtopic(tag))
-      const hashtags = hashtagsRaw.map(tag => normalizeSubtopic(tag))
+      // Normalize subtopics for grouping using the same function as ThreadCard
+      const tTags = tTagsRaw.map(tag => normalizeTopic(tag))
+      const hashtags = hashtagsRaw.map(tag => normalizeTopic(tag))
        const allTopics = [...new Set([...tTags, ...hashtags])]
      
      // Get relay sources from event hints (tracked during publishing)