Browse Source

normalizing hashtags

imwald
Silberengel 5 months ago
parent
commit
3fb3526194
  1. 2
      src/constants.ts
  2. 54
      src/lib/discussion-topics.ts
  3. 15
      src/lib/draft-event.ts
  4. 102
      src/pages/primary/DiscussionsPage/index.tsx

2
src/constants.ts

@ -160,7 +160,7 @@ export const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/ @@ -160,7 +160,7 @@ export const EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/
export const EMOJI_SHORT_CODE_REGEX = /:[a-zA-Z0-9_-]+:/g
export const EMBEDDED_EVENT_REGEX = /nostr:(note1[a-z0-9]{58}|nevent1[a-z0-9]+|naddr1[a-z0-9]+)/g
export const EMBEDDED_MENTION_REGEX = /nostr:(npub1[a-z0-9]{58}|nprofile1[a-z0-9]+)/g
export const HASHTAG_REGEX = /#[a-zA-Z0-9_\u00C0-\u017F\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]+/g
export const HASHTAG_REGEX = /#[a-zA-Z0-9_\-\u00C0-\u017F\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]+/g
export const LN_INVOICE_REGEX = /(ln(?:bc|tb|bcrt))([0-9]+[munp]?)?1([02-9ac-hj-np-z]+)/g
export const EMOJI_REGEX =
/[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA70}-\u{1FAFF}]|[\u{1F004}]|[\u{1F0CF}]|[\u{1F18E}]|[\u{3030}]|[\u{2B50}]|[\u{2B55}]|[\u{2934}-\u{2935}]|[\u{2B05}-\u{2B07}]|[\u{2B1B}-\u{2B1C}]|[\u{3297}]|[\u{3299}]|[\u{303D}]|[\u{00A9}]|[\u{00AE}]|[\u{2122}]|[\u{23E9}-\u{23EF}]|[\u{23F0}]|[\u{23F3}]|[\u{FE00}-\u{FE0F}]|[\u{200D}]/gu

54
src/lib/discussion-topics.ts

@ -2,39 +2,53 @@ import { HASHTAG_REGEX } from '@/constants' @@ -2,39 +2,53 @@ import { HASHTAG_REGEX } from '@/constants'
import { NostrEvent } from 'nostr-tools'
/**
* Normalize a topic string to lowercase with hyphens, no spaces
* Also converts plurals to singular form
* Normalize a hashtag/topic string
* @param text The text to normalize
* @param replaceSpaces Whether to replace spaces with hyphens (true for t-tags, false for content hashtags)
* @returns Normalized string (lowercase, filtered characters, singular form)
*/
export function normalizeTopic(topic: string): string {
let normalized = topic
.toLowerCase()
.replace(/\s+/g, '-')
.replace(/[^a-z0-9-]/g, '')
.replace(/-+/g, '-')
.replace(/^-|-$/g, '')
export function normalizeHashtag(text: string, replaceSpaces: boolean = true): string {
// Convert to lowercase and optionally replace spaces with hyphens
let normalized = text.toLowerCase()
if (replaceSpaces) {
normalized = normalized.replace(/\s+/g, '-')
}
// Only allow letters, numbers, hyphens, and underscores
normalized = normalized.replace(/[^a-z0-9_-]/g, '')
// Clean up multiple consecutive hyphens/underscores
normalized = normalized.replace(/[-_]+/g, '-')
// Remove leading/trailing hyphens/underscores
normalized = normalized.replace(/^[-_]+|[-_]+$/g, '')
// Reject hashtags that are only numbers
if (/^[0-9]+$/.test(normalized)) {
return ''
}
// Reject empty strings
if (!normalized) {
return ''
}
// Convert plural to singular (simple English plurals)
// Handle common cases: -ies -> -y, -es -> (sometimes), -s -> remove
if (normalized.endsWith('ies') && normalized.length > 4) {
// cities -> city, berries -> berry
normalized = normalized.slice(0, -3) + 'y'
} else if (normalized.endsWith('ves') && normalized.length > 4) {
// wives -> wife, knives -> knife
normalized = normalized.slice(0, -3) + 'fe'
} else if (normalized.endsWith('ses') && normalized.length > 4) {
// classes -> class, bosses -> boss
normalized = normalized.slice(0, -2)
} else if (normalized.endsWith('xes') && normalized.length > 4) {
// boxes -> box
normalized = normalized.slice(0, -2)
} else if (normalized.endsWith('shes') && normalized.length > 5) {
// dishes -> dish
normalized = normalized.slice(0, -2)
} else if (normalized.endsWith('ches') && normalized.length > 5) {
// churches -> church
normalized = normalized.slice(0, -2)
} else if (normalized.endsWith('s') && normalized.length > 2) {
// Simple plural: cats -> cat, bitcoins -> bitcoin
// Simple plural: cats -> cat, bitcoins -> bitcoin, Christians -> Christian
// But avoid removing 's' from words that naturally end in 's'
// Check if second-to-last character is not 's' to avoid "ss" words
const secondLast = normalized[normalized.length - 2]
@ -46,6 +60,14 @@ export function normalizeTopic(topic: string): string { @@ -46,6 +60,14 @@ export function normalizeTopic(topic: string): string {
return normalized
}
/**
 * Normalize a topic string as used in t-tags.
 *
 * Thin wrapper over normalizeHashtag with space replacement enabled, so runs
 * of whitespace in the topic are turned into hyphens before filtering.
 *
 * @param topic The raw topic text
 * @returns The normalized topic as produced by normalizeHashtag
 */
export function normalizeTopic(topic: string): string {
  const replaceSpaces = true
  return normalizeHashtag(topic, replaceSpaces)
}
/**
* Extract hashtags from content
*/

15
src/lib/draft-event.ts

@ -3,6 +3,7 @@ import client from '@/services/client.service' @@ -3,6 +3,7 @@ import client from '@/services/client.service'
import customEmojiService from '@/services/custom-emoji.service'
import mediaUpload from '@/services/media-upload.service'
import { prefixNostrAddresses } from '@/lib/nostr-address'
import { normalizeHashtag } from '@/lib/discussion-topics'
import {
TDraftEvent,
TEmoji,
@ -742,11 +743,17 @@ async function extractCommentMentions(content: string, parentEvent: Event) { @@ -742,11 +743,17 @@ async function extractCommentMentions(content: string, parentEvent: Event) {
function extractHashtags(content: string) {
const hashtags: string[] = []
const matches = content.match(/#[\p{L}\p{N}\p{M}]+/gu)
// Match hashtags including hyphens, underscores, and unicode characters
// But stop at whitespace or common punctuation
const matches = content.match(/#[\p{L}\p{N}\p{M}_-]+/gu)
matches?.forEach((m) => {
const hashtag = m.slice(1).toLowerCase()
if (hashtag) {
hashtags.push(hashtag)
const hashtag = m.slice(1)
// Use shared normalization function (without space replacement for content hashtags)
const normalized = normalizeHashtag(hashtag, false)
// Only add if not empty (normalizeHashtag already filters out pure numbers)
if (normalized) {
hashtags.push(normalized)
}
})
return hashtags

102
src/pages/primary/DiscussionsPage/index.tsx

@ -1,8 +1,9 @@ @@ -1,8 +1,9 @@
import { Button } from '@/components/ui/button'
import { Card, CardContent } from '@/components/ui/card'
// Removed dropdown menu import - no longer using relay selection
import { FAST_READ_RELAY_URLS } from '@/constants'
import { FAST_READ_RELAY_URLS, HASHTAG_REGEX } from '@/constants'
import { normalizeUrl } from '@/lib/url'
import { normalizeTopic } from '@/lib/discussion-topics'
import { useFavoriteRelays } from '@/providers/FavoriteRelaysProvider'
import { useNostr } from '@/providers/NostrProvider'
import { forwardRef, useEffect, useState, useCallback, useRef } from 'react'
@ -22,89 +23,6 @@ import { useSecondaryPage } from '@/PageManager' @@ -22,89 +23,6 @@ import { useSecondaryPage } from '@/PageManager'
import { toNote } from '@/lib/link'
import { kinds } from 'nostr-tools'
/**
 * Reduce a subtopic hashtag to a rough root form so that close variations
 * (plurals, -ing/-ed forms, a few derivational suffixes) group together.
 *
 * The input is lowercased and trimmed first. Words of two characters or fewer
 * and compound tags containing '-' or '_' are returned without stemming.
 * Exactly one suffix rule is applied; rules are not chained.
 *
 * @param tag The raw hashtag text
 * @returns The lowercased, trimmed and (where a rule matched) stemmed tag
 */
function normalizeSubtopic(tag: string): string {
  const word = tag.toLowerCase().trim()

  // Too short to stem safely, or a compound tag: hand back as-is.
  if (word.length <= 2 || /[-_]/.test(word)) {
    return word
  }

  // Drops the last letter of a root that ends in a doubled letter
  // (runn -> run, shopp -> shop); any other root is returned unchanged.
  const undouble = (root: string): string => {
    const n = root.length
    return n >= 2 && root[n - 1] === root[n - 2] ? root.slice(0, -1) : root
  }

  // Plurals: a trailing 's' that is not part of a final 'ss'.
  // Every word ending in a single 's' returns from this branch, which is
  // why no later rule needs to consider an 's'-terminated suffix.
  if (word.endsWith('s') && !word.endsWith('ss')) {
    // stories -> story
    if (word.length > 4 && word.endsWith('ies')) {
      return `${word.slice(0, -3)}y`
    }
    // churches -> church, boxes -> box; otherwise just strip the 's'
    const takesEs = ['ches', 'shes', 'xes', 'zes', 'ses'].some((suffix) => word.endsWith(suffix))
    return word.slice(0, takesEs ? -2 : -1)
  }

  // reading -> read, running -> run
  if (word.length > 5 && word.endsWith('ing')) {
    return undouble(word.slice(0, -3))
  }

  // stopped -> stop (note: only the 'ed' is stripped, so deleted -> delet)
  if (word.length > 4 && word.endsWith('ed')) {
    return undouble(word.slice(0, -2))
  }

  // developer -> develop; 'eer' words (engineer) and short words (user) are kept
  if (word.length > 4 && word.endsWith('er') && !word.endsWith('eer')) {
    return word.slice(0, -2)
  }

  // quickly -> quick
  if (word.length > 4 && word.endsWith('ly')) {
    return word.slice(0, -2)
  }

  // buddhism / buddhist -> buddh, christianity -> christian, christian -> christ
  const threeLetterSuffixes = ['ism', 'ist', 'ity', 'ian']
  if (word.length > 5 && threeLetterSuffixes.some((suffix) => word.endsWith(suffix))) {
    return word.slice(0, -3)
  }

  return word
}
// Function to determine topic based on actual t-tags and hashtags
function getTopicFromTags(allTopics: string[], availableTopicIds: string[]): string {
@ -364,8 +282,8 @@ const DiscussionsPage = forwardRef((_, ref) => { @@ -364,8 +282,8 @@ const DiscussionsPage = forwardRef((_, ref) => {
// Extract topics - normalize subtopics but keep originals for topic detection
const tTagsRaw = event.tags.filter(tag => tag[0] === 't' && tag[1]).map(tag => tag[1].toLowerCase())
// Match hashtags with letters, numbers, hyphens, and underscores
const hashtagsRaw = (event.content.match(/#[\w-]+/g) || []).map(tag => tag.slice(1).toLowerCase())
// Match hashtags using the shared HASHTAG_REGEX constant from '@/constants'
const hashtagsRaw = (event.content.match(HASHTAG_REGEX) || []).map(tag => tag.slice(1).toLowerCase())
const allTopicsRaw = [...new Set([...tTagsRaw, ...hashtagsRaw])]
// Determine the main topic from raw tags (use only predefined topics during fetch)
@ -373,8 +291,8 @@ const DiscussionsPage = forwardRef((_, ref) => { @@ -373,8 +291,8 @@ const DiscussionsPage = forwardRef((_, ref) => {
const categorizedTopic = getTopicFromTags(allTopicsRaw, predefinedTopicIds)
// Normalize subtopics for grouping (but not main topic IDs)
const tTags = tTagsRaw.map(tag => normalizeSubtopic(tag))
const hashtags = hashtagsRaw.map(tag => normalizeSubtopic(tag))
const tTags = tTagsRaw.map(tag => normalizeTopic(tag))
const hashtags = hashtagsRaw.map(tag => normalizeTopic(tag))
const allTopics = [...new Set([...tTags, ...hashtags])]
finalEventMap.set(eventId, {
@ -574,16 +492,16 @@ const DiscussionsPage = forwardRef((_, ref) => { @@ -574,16 +492,16 @@ const DiscussionsPage = forwardRef((_, ref) => {
// Extract topics from the published event
const tTagsRaw = publishedEvent.tags.filter(tag => tag[0] === 't' && tag[1]).map(tag => tag[1].toLowerCase())
const hashtagsRaw = (publishedEvent.content.match(/#[\w-]+/g) || []).map(tag => tag.slice(1).toLowerCase())
const hashtagsRaw = (publishedEvent.content.match(HASHTAG_REGEX) || []).map(tag => tag.slice(1).toLowerCase())
const allTopicsRaw = [...new Set([...tTagsRaw, ...hashtagsRaw])]
// Determine the main topic from raw tags
const predefinedTopicIds = DISCUSSION_TOPICS.map(t => t.id)
const categorizedTopic = getTopicFromTags(allTopicsRaw, predefinedTopicIds)
// Normalize subtopics for grouping
const tTags = tTagsRaw.map(tag => normalizeSubtopic(tag))
const hashtags = hashtagsRaw.map(tag => normalizeSubtopic(tag))
// Normalize subtopics for grouping using the same function as ThreadCard
const tTags = tTagsRaw.map(tag => normalizeTopic(tag))
const hashtags = hashtagsRaw.map(tag => normalizeTopic(tag))
const allTopics = [...new Set([...tTags, ...hashtags])]
// Get relay sources from event hints (tracked during publishing)

Loading…
Cancel
Save