import { DEFAULT_RSS_FEEDS } from '@/constants' |
|
import { canonicalizeRssArticleUrl } from '@/lib/rss-article' |
|
import { cleanUrl } from '@/lib/url' |
|
import logger from '@/lib/logger' |
|
import { buildViteProxySitesFetchUrl, urlLooksLikeViteProxyRequest } from '@/lib/vite-proxy-url' |
|
import indexedDb from '@/services/indexed-db.service' |
|
|
|
export interface RssFeedItemMedia { |
|
url: string |
|
type?: string |
|
credit?: string |
|
thumbnail?: string |
|
width?: string |
|
height?: string |
|
} |
|
|
|
export interface RssFeedItemEnclosure { |
|
url: string |
|
type: string |
|
length?: string |
|
duration?: string |
|
} |
|
|
|
export interface RssFeedItem { |
|
title: string |
|
link: string |
|
description: string |
|
pubDate: Date | null |
|
guid: string |
|
feedUrl: string |
|
feedTitle?: string |
|
feedImage?: string |
|
feedDescription?: string |
|
media?: RssFeedItemMedia[] |
|
enclosure?: RssFeedItemEnclosure |
|
} |
|
|
|
export interface RssFeed { |
|
title: string |
|
link: string |
|
description: string |
|
items: RssFeedItem[] |
|
feedUrl: string |
|
image?: { |
|
url?: string |
|
title?: string |
|
link?: string |
|
width?: string |
|
height?: string |
|
description?: string |
|
} |
|
language?: string |
|
copyright?: string |
|
generator?: string |
|
lastBuildDate?: Date |
|
} |
|
|
|
/** Synthetic row for URL-only threads (Nostr activity on a link without an RSS cache hit). */ |
|
export const WEB_ONLY_FAUX_FEED_URL = 'nostr:jumble/web-faux-rss-item' |
|
|
|
export function isWebOnlyFauxRssItem(item: Pick<RssFeedItem, 'feedUrl' | 'guid'>): boolean { |
|
return item.feedUrl === WEB_ONLY_FAUX_FEED_URL || item.guid.startsWith('web-only:') |
|
} |
|
|
|
export function createWebOnlyRssFeedItem(articleUrl: string): RssFeedItem { |
|
const canonical = canonicalizeRssArticleUrl(articleUrl.trim()) |
|
return { |
|
title: canonical, |
|
link: canonical, |
|
description: '', |
|
pubDate: null, |
|
guid: `web-only:${canonical}`, |
|
feedUrl: WEB_ONLY_FAUX_FEED_URL, |
|
feedTitle: undefined |
|
} |
|
} |
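// Usage sketch (hypothetical call site, not part of this module): a link-only thread can be
// represented and later recognized without an RSS cache hit.
//
//   const faux = createWebOnlyRssFeedItem('https://example.com/article')
//   faux.feedUrl === WEB_ONLY_FAUX_FEED_URL                                           // true
//   isWebOnlyFauxRssItem(faux)                                                        // true
//   isWebOnlyFauxRssItem({ feedUrl: 'https://example.com/feed.xml', guid: 'post-1' }) // false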
|
|
|
const RSS_FEED_FETCH_ATTEMPTED_KEYS_SETTING = 'rssFeedFetchAttemptedKeys' |
|
|
|
class RssFeedService { |
|
static instance: RssFeedService |
|
private feedCache: Map<string, { feed: RssFeed; timestamp: number }> = new Map() |
|
private readonly CACHE_DURATION = 5 * 60 * 1000 // 5 minutes |
|
private backgroundRefreshController: AbortController | null = null |
|
private monthMapCache: Record<string, string> | null = null |
|
private activeFetchPromises: Map<string, Promise<RssFeed>> = new Map() // Track active fetches by URL |
|
/** Global RSS item cap in IndexedDB; oldest by pubDate are removed when exceeded. */ |
|
private readonly MAX_CACHED_RSS_ITEMS = 5000 |
|
/** |
|
* Feed URLs we already tried to hydrate (success or hard failure). Without this, a feed that never |
|
* yields items (CORS, dead host) stays "missing" forever and blocks every load / retriggers refresh. |
|
* Persisted so a full reload does not repeat a 30s wait for the same dead URL. |
|
*/ |
|
private rssFeedAttemptedKeys = new Set<string>() |
|
private rssFeedAttemptedKeysLoaded = false |
|
/** Same feed list + overlapping time: one network refresh (Strict Mode / remount / HMR). */ |
|
private rssMultiFeedRefreshInFlight = new Map<string, Promise<void>>() |
|
|
|
private rssMultiFeedRefreshKey(feedUrls: string[]): string { |
|
return [...feedUrls].map((u) => this.normalizeRssFeedKeyUrl(u)).sort().join('\u0001') |
|
} |
|
|
|
constructor() { |
|
if (!RssFeedService.instance) { |
|
RssFeedService.instance = this |
|
} |
|
return RssFeedService.instance |
|
} |
|
|
|
private normalizeRssFeedKeyUrl(url: string): string { |
|
return url.trim().replace(/\/$/, '') |
|
} |
|
|
|
private async ensureRssFeedAttemptedKeysLoaded(): Promise<void> { |
|
if (this.rssFeedAttemptedKeysLoaded) return |
|
this.rssFeedAttemptedKeysLoaded = true |
|
try { |
|
const raw = await indexedDb.getSetting(RSS_FEED_FETCH_ATTEMPTED_KEYS_SETTING) |
|
if (!raw) return |
|
const parsed = JSON.parse(raw) as unknown |
|
if (!Array.isArray(parsed)) return |
|
for (const x of parsed) { |
|
if (typeof x === 'string' && x.trim()) { |
|
this.rssFeedAttemptedKeys.add(this.normalizeRssFeedKeyUrl(x)) |
|
} |
|
} |
|
} catch (e) { |
|
logger.warn('[RssFeedService] Failed to load attempted feed URL keys', { error: e }) |
|
} |
|
} |
|
|
|
private async persistRssFeedAttemptedKeys(): Promise<void> { |
|
try { |
|
await indexedDb.setSetting( |
|
RSS_FEED_FETCH_ATTEMPTED_KEYS_SETTING, |
|
JSON.stringify([...this.rssFeedAttemptedKeys]) |
|
) |
|
} catch (e) { |
|
logger.warn('[RssFeedService] Failed to persist attempted feed URL keys', { error: e }) |
|
} |
|
} |
|
|
|
private markFeedKeysAttempted(urls: string[]): void { |
|
for (const u of urls) { |
|
this.rssFeedAttemptedKeys.add(this.normalizeRssFeedKeyUrl(u)) |
|
} |
|
} |
|
|
|
private parseItemPubDate(item: RssFeedItem): Date | null { |
|
if (!item.pubDate) return null |
|
if (item.pubDate instanceof Date) return item.pubDate |
|
if (typeof item.pubDate === 'number') return new Date(item.pubDate) |
|
if (typeof item.pubDate === 'string') return new Date(item.pubDate) |
|
return null |
|
} |
|
|
|
/** |
|
* Merge refreshed feeds into the full IndexedDB cache, trim oldest items when over the cap, |
|
* and rewrite the store so pruned rows are removed (put-only would leave stale keys). |
|
*/ |
|
private async persistGlobalRssCacheAfterMerge( |
|
mergedFromRefresh: RssFeedItem[], |
|
refreshedFeedUrls: string[] |
|
): Promise<void> { |
|
const refreshedSet = new Set(refreshedFeedUrls.map((u) => this.normalizeRssFeedKeyUrl(u))) |
|
let all: RssFeedItem[] = [] |
|
try { |
|
all = await indexedDb.getRssFeedItems() |
|
} catch (e) { |
|
logger.warn('[RssFeedService] persistGlobalRssCacheAfterMerge: read cache failed', { error: e }) |
|
} |
|
const map = new Map<string, RssFeedItem>() |
|
for (const item of all) { |
|
const key = `${item.feedUrl}:${item.guid}` |
|
if (!refreshedSet.has(this.normalizeRssFeedKeyUrl(item.feedUrl))) { |
|
map.set(key, { |
|
...item, |
|
pubDate: this.parseItemPubDate(item) |
|
}) |
|
} |
|
} |
|
for (const item of mergedFromRefresh) { |
|
map.set(`${item.feedUrl}:${item.guid}`, item) |
|
} |
|
let combined = Array.from(map.values()) |
|
combined.sort((a, b) => { |
|
const dateA = a.pubDate?.getTime() || 0 |
|
const dateB = b.pubDate?.getTime() || 0 |
|
return dateB - dateA |
|
}) |
|
if (combined.length > this.MAX_CACHED_RSS_ITEMS) { |
|
combined = combined.slice(0, this.MAX_CACHED_RSS_ITEMS) |
|
} |
|
try { |
|
await indexedDb.clearRssFeedItems() |
|
await indexedDb.putRssFeedItems(combined) |
|
} catch (error) { |
|
logger.error('[RssFeedService] persistGlobalRssCacheAfterMerge failed', { error }) |
|
} |
|
} |
|
|
|
/** |
|
* Fetch and parse an RSS/Atom feed from a URL |
|
*/ |
|
async fetchFeed(url: string, signal?: AbortSignal): Promise<RssFeed> { |
|
// Check cache first |
|
const cached = this.feedCache.get(url) |
|
if (cached && Date.now() - cached.timestamp < this.CACHE_DURATION) { |
|
logger.debug('[RssFeedService] Returning cached feed', { url }) |
|
return cached.feed |
|
} |
|
|
|
// Check if already aborted |
|
if (signal?.aborted) { |
|
logger.warn('[RssFeedService] Signal already aborted before fetchFeed', { url }) |
|
throw new DOMException('The operation was aborted.', 'AbortError') |
|
} |
|
|
|
// Check if there's already an active fetch for this URL (deduplicate simultaneous requests) |
|
const activeFetch = this.activeFetchPromises.get(url) |
|
if (activeFetch) { |
|
logger.debug('[RssFeedService] Reusing active fetch for URL', { url }) |
|
return activeFetch |
|
} |
|
|
|
// Create fetch promise and track it |
|
const fetchPromise = (async () => { |
|
try { |
|
// Try multiple fetch strategies in order |
|
const strategies = this.getFetchStrategies(url) |
|
|
|
for (const strategy of strategies) { |
|
// Check if aborted before trying next strategy |
|
if (signal?.aborted) { |
|
logger.warn('[RssFeedService] Signal aborted during fetch strategies', { url, strategy: strategy.name }) |
|
throw new DOMException('The operation was aborted.', 'AbortError') |
|
} |
|
|
|
try { |
|
logger.debug('[RssFeedService] Trying fetch strategy', { url, strategy: strategy.name }) |
|
const xmlText = await this.fetchWithStrategy(url, strategy, signal) |
|
if (xmlText) { |
|
const feed = this.parseFeed(xmlText, url) |
|
// Cache the feed |
|
this.feedCache.set(url, { feed, timestamp: Date.now() }) |
|
logger.info('[RssFeedService] Successfully fetched and parsed feed', { |
|
url, |
|
itemCount: feed.items.length, |
|
strategy: strategy.name |
|
}) |
|
return feed |
|
} |
|
} catch (error) { |
|
// Abort errors are expected during cleanup - surface them immediately instead of trying the next strategy |
|
if (error instanceof DOMException && error.name === 'AbortError') { |
|
logger.warn('[RssFeedService] Fetch aborted', { url, strategy: strategy.name }) |
|
throw error // Re-throw abort errors immediately |
|
} |
|
logger.warn('[RssFeedService] Strategy failed', { url, strategy: strategy.name, error }) |
|
// Continue to next strategy |
|
continue |
|
} |
|
} |
|
|
|
// All strategies failed |
|
throw new Error(`Failed to fetch RSS feed from ${url} after trying all available methods`) |
|
} finally { |
|
// Remove from active fetches when done |
|
this.activeFetchPromises.delete(url) |
|
} |
|
})() |
|
|
|
// Store the promise to deduplicate simultaneous requests |
|
this.activeFetchPromises.set(url, fetchPromise) |
|
|
|
return fetchPromise |
|
} |
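// Usage sketch (assumed caller, e.g. a React effect, importing the default export as
// rssFeedService): the caller's AbortSignal aborts the underlying fetch, repeated calls within
// the 5-minute window hit the in-memory cache, and simultaneous calls for the same URL share
// one in-flight promise via activeFetchPromises.
//
//   const ac = new AbortController()
//   rssFeedService.fetchFeed('https://example.com/feed.xml', ac.signal)
//     .then((feed) => console.log(feed.title, feed.items.length))
//     .catch((err) => {
//       if (!(err instanceof DOMException && err.name === 'AbortError')) throw err
//     })
//   // later, e.g. on unmount: ac.abort()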
|
|
|
/** |
|
* Get list of fetch strategies to try in order |
|
*/ |
|
private getFetchStrategies(url: string): Array<{ name: string; getUrl: (url: string) => string }> { |
|
const strategies: Array<{ name: string; getUrl: (url: string) => string }> = [] |
|
|
|
// Strategy 1: Same `VITE_PROXY_SERVER` contract as OG/link preview (`sites/?url=…`), not path-encoded `/sites/{url}`. |
|
const proxyServer = import.meta.env.VITE_PROXY_SERVER?.trim() |
|
if (proxyServer && !urlLooksLikeViteProxyRequest(url)) { |
|
strategies.push({ |
|
name: 'configured-proxy', |
|
getUrl: (u) => buildViteProxySitesFetchUrl(u, proxyServer) |
|
}) |
|
} |
|
|
|
// Strategy 2: Use public CORS proxy (allorigins.win) |
|
strategies.push({ |
|
name: 'allorigins-proxy', |
|
getUrl: (url) => `https://api.allorigins.win/raw?url=${encodeURIComponent(url)}` |
|
}) |
|
|
|
// Strategy 3: Alternative CORS proxy (corsproxy.io) |
|
strategies.push({ |
|
name: 'corsproxy-proxy', |
|
getUrl: (url) => `https://corsproxy.io/?${encodeURIComponent(url)}` |
|
}) |
|
|
|
// Strategy 4: Try direct fetch (may work for some feeds with CORS enabled) |
|
strategies.push({ |
|
name: 'direct', |
|
getUrl: (url) => url |
|
}) |
|
|
|
return strategies |
|
} |
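// For illustration (hypothetical feed URL), the strategies above resolve to these fetch URLs,
// tried in order until one returns parseable XML:
//
//   configured-proxy : buildViteProxySitesFetchUrl('https://example.com/feed.xml', VITE_PROXY_SERVER)
//   allorigins-proxy : https://api.allorigins.win/raw?url=https%3A%2F%2Fexample.com%2Ffeed.xml
//   corsproxy-proxy  : https://corsproxy.io/?https%3A%2F%2Fexample.com%2Ffeed.xml
//   direct           : https://example.com/feed.xml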
|
|
|
/** |
|
* Fetch feed using a specific strategy |
|
*/ |
|
private async fetchWithStrategy(originalUrl: string, strategy: { name: string; getUrl: (url: string) => string }, externalSignal?: AbortSignal): Promise<string> { |
|
const fetchUrl = strategy.getUrl(originalUrl) |
|
|
|
// Check if external signal is already aborted |
|
if (externalSignal?.aborted) { |
|
throw new DOMException('The operation was aborted.', 'AbortError') |
|
} |
|
|
|
const controller = new AbortController() |
|
// RSS endpoints can be slow, so there is no hard timeout here: after 30 seconds we only log
// a warning and let the fetch continue (or fail) on its own
|
const timeoutId = setTimeout(() => { |
|
logger.warn('[RssFeedService] Fetch taking longer than expected', { |
|
url: originalUrl, |
|
strategy: strategy.name, |
|
elapsed: '30s' |
|
}) |
|
// Don't abort - just log. The fetch will continue or fail naturally |
|
}, 30000) // 30 second warning (but don't abort) |
|
|
|
// If external signal is provided, abort our controller when external signal aborts |
|
if (externalSignal) { |
|
externalSignal.addEventListener('abort', () => { |
|
clearTimeout(timeoutId) |
|
controller.abort() |
|
}, { once: true }) |
|
} |
|
|
|
try { |
|
const res = await fetch(fetchUrl, { |
|
signal: controller.signal, |
|
mode: 'cors', |
|
credentials: 'omit', |
|
headers: { |
|
'Accept': 'application/rss+xml, application/xml, application/atom+xml, text/xml, */*' |
|
} |
|
}) |
|
|
|
clearTimeout(timeoutId) |
|
|
|
if (!res.ok) { |
|
throw new Error(`HTTP ${res.status}: ${res.statusText}`) |
|
} |
|
|
|
const xmlText = await res.text() |
|
|
|
// Validate that we got XML content |
|
if (!xmlText || xmlText.trim().length === 0) { |
|
throw new Error('Empty response') |
|
} |
|
|
|
// Basic validation - check if it looks like XML |
|
if (!xmlText.trim().startsWith('<')) { |
|
throw new Error('Response does not appear to be XML') |
|
} |
|
|
|
return xmlText |
|
} catch (error) { |
|
clearTimeout(timeoutId) |
|
// Re-throw everything, abort errors included, so the caller can move on to the next strategy or bail out
throw error
|
} |
|
} |
|
|
|
/** |
|
* Parse RSS/Atom XML into structured data |
|
*/ |
|
private parseFeed(xmlText: string, feedUrl: string): RssFeed { |
|
const parser = new DOMParser() |
|
const doc = parser.parseFromString(xmlText, 'text/xml') |
|
|
|
// Check for parsing errors |
|
const parserError = doc.querySelector('parsererror') |
|
if (parserError) { |
|
throw new Error('Failed to parse XML feed') |
|
} |
|
|
|
// Determine if it's RSS or Atom |
|
const isAtom = doc.documentElement.tagName === 'feed' || doc.documentElement.namespaceURI === 'http://www.w3.org/2005/Atom' |
|
|
|
if (isAtom) { |
|
return this.parseAtomFeed(doc, feedUrl) |
|
} else { |
|
return this.parseRssFeed(doc, feedUrl) |
|
} |
|
} |
|
|
|
/** |
|
* Parse RSS 2.0 feed |
|
*/ |
|
private parseRssFeed(doc: Document, feedUrl: string): RssFeed { |
|
const channel = doc.querySelector('channel') |
|
if (!channel) { |
|
throw new Error('Invalid RSS feed: no channel element found') |
|
} |
|
|
|
const title = this.getTextContent(channel, 'title') || 'Untitled Feed' |
|
const link = this.getTextContent(channel, 'link') || feedUrl |
|
const description = this.getTextContent(channel, 'description') || '' |
|
|
|
// Extract feed metadata |
|
const language = this.getTextContent(channel, 'language') || undefined |
|
const copyright = this.getTextContent(channel, 'copyright') || undefined |
|
const generator = this.getTextContent(channel, 'generator') || undefined |
|
const lastBuildDateStr = this.getTextContent(channel, 'lastBuildDate') |
|
const lastBuildDate = lastBuildDateStr ? (this.parseDate(lastBuildDateStr) || undefined) : undefined |
|
|
|
// Extract feed image |
|
// Check all channel children for image elements (both standard RSS and namespaced) |
|
let feedImage: RssFeed['image'] | undefined |
|
const allChannelChildren = Array.from(channel.children) |
|
|
|
// First, try to find standard RSS 2.0 <image> element |
|
const standardImageElements = allChannelChildren.filter(child => { |
|
const nodeName = child.nodeName.toLowerCase() |
|
const localName = child.localName || nodeName |
|
const namespaceURI = child.namespaceURI |
|
// Standard RSS image element has nodeName "image" with no namespace prefix |
|
return localName === 'image' && |
|
!nodeName.includes(':') && |
|
(!namespaceURI || (!namespaceURI.includes('itunes') && !namespaceURI.includes('media'))) |
|
}) |
|
|
|
if (standardImageElements.length > 0) { |
|
const imageElement = standardImageElements[0] |
|
logger.debug('[RssFeedService] Processing standard image element', { |
|
url: feedUrl, |
|
nodeName: imageElement.nodeName, |
|
localName: imageElement.localName, |
|
childrenCount: imageElement.children.length, |
|
innerHTML: imageElement.innerHTML?.substring(0, 200) |
|
}) |
|
|
|
const imageUrl = this.getTextContent(imageElement, 'url') |
|
logger.debug('[RssFeedService] Extracted image URL', { url: feedUrl, imageUrl }) |
|
|
|
if (imageUrl) { |
|
const imageTitle = this.getTextContent(imageElement, 'title') |
|
const imageLink = this.getTextContent(imageElement, 'link') |
|
const imageWidth = this.getTextContent(imageElement, 'width') |
|
const imageHeight = this.getTextContent(imageElement, 'height') |
|
const imageDescription = this.getTextContent(imageElement, 'description') |
|
|
|
feedImage = { |
|
url: imageUrl, |
|
title: imageTitle || undefined, |
|
link: imageLink || undefined, |
|
width: imageWidth || undefined, |
|
height: imageHeight || undefined, |
|
description: imageDescription || undefined |
|
} |
|
logger.debug('[RssFeedService] Found standard RSS feed image element', { url: feedUrl, imageUrl, feedImage }) |
|
} else { |
|
logger.warn('[RssFeedService] Standard image element found but no URL extracted', { |
|
url: feedUrl, |
|
imageElementHTML: imageElement.outerHTML?.substring(0, 300) |
|
}) |
|
} |
|
} |
|
|
|
// If no standard image found, check for itunes:image (common in podcast feeds) |
|
if (!feedImage) { |
|
const itunesImageElements = allChannelChildren.filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
const namespaceURI = child.namespaceURI |
|
// Check if it's itunes:image by namespace or nodeName |
|
return (localName === 'image' && namespaceURI && namespaceURI.includes('itunes')) || |
|
nodeName === 'itunes:image' || |
|
(nodeName.includes('itunes') && nodeName.includes('image')) |
|
}) |
|
|
|
if (itunesImageElements.length > 0) { |
|
const itunesImage = itunesImageElements[0] |
|
// itunes:image uses href attribute, not nested url element |
|
const href = itunesImage.getAttribute('href') |
|
if (href) { |
|
feedImage = { url: href } |
|
logger.debug('[RssFeedService] Found itunes:image', { url: feedUrl, imageUrl: href }) |
|
} |
|
} |
|
} |
|
|
|
logger.debug('[RssFeedService] Feed image extraction result', { |
|
url: feedUrl, |
|
hasImage: !!feedImage, |
|
imageUrl: feedImage?.url, |
|
channelChildrenCount: allChannelChildren.length, |
|
standardImageCount: standardImageElements.length |
|
}) |
|
|
|
const items: RssFeedItem[] = [] |
|
const itemElements = channel.querySelectorAll('item') |
|
|
|
itemElements.forEach((item) => { |
|
const itemTitle = this.getTextContent(item, 'title') || '' |
|
let itemLink = this.getTextContent(item, 'link') || '' |
|
// Convert relative URLs to absolute |
|
if (itemLink && !itemLink.startsWith('http://') && !itemLink.startsWith('https://')) { |
|
try { |
|
const baseUrl = new URL(feedUrl) |
|
itemLink = new URL(itemLink, baseUrl.origin).href |
|
} catch { |
|
// If URL parsing fails, keep the original link |
|
} |
|
} |
|
if (itemLink) { |
|
const cleanedLink = cleanUrl(itemLink) |
|
if (cleanedLink) itemLink = cleanedLink |
|
} |
|
// For description, prefer content:encoded (WordPress full content) over description (truncated) |
|
// Check for content:encoded first, then fall back to description |
|
let itemDescription = '' |
|
|
|
// Try to find content:encoded element (WordPress namespace extension) |
|
// Iterate through all direct children to find it (most reliable method for namespaced XML) |
|
const children = Array.from(item.children) |
|
let contentEncoded: Element | null = null |
|
|
|
for (const child of children) { |
|
// Check if this is the content:encoded element |
|
// The tagName might be "content:encoded" or just "encoded" depending on parser |
|
const tagName = child.tagName || child.nodeName |
|
if (tagName && ( |
|
tagName.toLowerCase() === 'encoded' || |
|
tagName.toLowerCase() === 'content:encoded' || |
|
tagName.includes('encoded') || |
|
(child.localName && child.localName.toLowerCase() === 'encoded') |
|
)) { |
|
contentEncoded = child |
|
break |
|
} |
|
} |
|
|
|
if (contentEncoded) { |
|
// For CDATA sections in XML, we need to get the content carefully |
|
// The content:encoded element contains CDATA with HTML |
|
|
|
// Get textContent first (this properly extracts CDATA content) |
|
// textContent will contain the HTML as a string from CDATA sections |
|
const rawContent = contentEncoded.textContent?.trim() || contentEncoded.innerHTML?.trim() || '' |
|
|
|
if (rawContent) { |
|
// Clean up the content - remove any XML artifacts that might have leaked through |
|
// Remove XML closing tags that might appear at the end (like ]]>) |
|
itemDescription = rawContent |
|
.replace(/\]\]\s*>\s*$/g, '') // Remove trailing ]]> from CDATA |
|
.replace(/^\s*<!\[CDATA\[/g, '') // Remove leading CDATA declaration |
|
.trim() |
|
|
|
// If the content looks like it has HTML tags, use it as-is |
|
// Otherwise, it might be plain text that needs HTML entity decoding |
|
if (itemDescription && itemDescription.includes('<')) { |
|
// It's HTML - ensure it's clean |
|
// Remove any stray XML/namespace declarations that might appear |
|
itemDescription = itemDescription |
|
.replace(/<\?xml[^>]*\?>/gi, '') // Remove XML declarations |
|
.replace(/<\!DOCTYPE[^>]*>/gi, '') // Remove DOCTYPE declarations |
|
.trim() |
|
} |
|
} |
|
|
|
// Log for debugging |
|
if (itemDescription) { |
|
logger.debug('[RssFeedService] Found content:encoded', { |
|
url: feedUrl, |
|
hasHtml: itemDescription.includes('<'), |
|
length: itemDescription.length, |
|
preview: itemDescription.substring(0, 100) |
|
}) |
|
} |
|
} else { |
|
logger.debug('[RssFeedService] content:encoded not found, using description', { url: feedUrl }) |
|
} |
|
|
|
// Fall back to description if content:encoded is not found or empty |
|
if (!itemDescription) { |
|
// Try getting HTML content from description tag |
|
itemDescription = this.getHtmlContent(item, 'description') || '' |
|
|
|
// If that doesn't work, try getting text content and decode HTML entities |
|
// This handles cases where HTML entities are in the text content |
|
if (!itemDescription) { |
|
const descElement = item.querySelector('description') |
|
if (descElement) { |
|
// Get raw text content (which may contain HTML entities) |
|
const rawText = descElement.textContent?.trim() || descElement.innerHTML?.trim() || '' |
|
if (rawText) { |
|
// Decode HTML entities using a temporary element |
|
// The browser will automatically decode entities when setting innerHTML |
|
const temp = document.createElement('textarea') |
|
temp.innerHTML = rawText |
|
itemDescription = temp.value |
|
} |
|
} |
|
} |
|
|
|
// Clean description as well |
|
if (itemDescription) { |
|
itemDescription = itemDescription |
|
.replace(/\]\]\s*>\s*$/g, '') |
|
.replace(/^\s*<!\[CDATA\[/g, '') |
|
.trim() |
|
} |
|
} |
|
const pubDateText = this.getTextContent(item, 'pubDate') |
|
const itemPubDate = this.parseDate(pubDateText) |
|
let itemGuid = this.getTextContent(item, 'guid') || itemLink || '' |
|
if (itemGuid && (itemGuid.startsWith('http://') || itemGuid.startsWith('https://'))) { |
|
const cleanedGuid = cleanUrl(itemGuid) |
|
if (cleanedGuid) itemGuid = cleanedGuid |
|
} |
|
|
|
// Log item parsing for debugging |
|
if (!itemPubDate && pubDateText) { |
|
logger.warn('[RssFeedService] Failed to parse pubDate for item', { |
|
url: feedUrl, |
|
title: itemTitle.substring(0, 50), |
|
pubDateText, |
|
link: itemLink |
|
}) |
|
} |
|
|
|
// Extract enclosure element (for audio/video files) |
|
let enclosure: RssFeedItemEnclosure | undefined |
|
const enclosureElement = item.querySelector('enclosure') |
|
if (enclosureElement) { |
|
const enclosureUrl = enclosureElement.getAttribute('url') || '' |
|
const enclosureType = enclosureElement.getAttribute('type') || '' |
|
const enclosureLength = enclosureElement.getAttribute('length') || undefined |
|
|
|
if (enclosureUrl && enclosureType) { |
|
// Try to get duration from itunes:duration |
|
let duration: string | undefined |
|
const allItemChildren = Array.from(item.children) |
|
const durationElements = allItemChildren.filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
const namespaceURI = child.namespaceURI |
|
return (localName === 'duration' && (nodeName.includes('itunes:duration') || namespaceURI?.includes('itunes'))) || |
|
nodeName === 'itunes:duration' |
|
}) |
|
|
|
if (durationElements.length > 0) { |
|
duration = durationElements[0].textContent?.trim() || undefined |
|
} |
|
|
|
enclosure = { |
|
url: enclosureUrl, |
|
type: enclosureType, |
|
length: enclosureLength, |
|
duration: duration |
|
} |
|
|
|
logger.debug('[RssFeedService] Found enclosure', { |
|
url: feedUrl, |
|
itemTitle: itemTitle.substring(0, 50), |
|
enclosureType: enclosureType, |
|
enclosureUrl: enclosureUrl, |
|
duration: duration |
|
}) |
|
} |
|
} |
|
|
|
// Extract media:content elements (Media RSS) |
|
// Handle namespaced elements by checking all elements and filtering by localName and namespace |
|
const media: RssFeedItemMedia[] = [] |
|
|
|
// Get all child elements and filter for media:content |
|
// media:content has localName "content" but is in the media namespace |
|
// Regular RSS content:encoded has localName "encoded" and is in the content namespace (different!) |
|
const allChildren = Array.from(item.children) |
|
const mediaContentElements = allChildren.filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
const namespaceURI = child.namespaceURI |
|
|
|
// media:content elements have: |
|
// 1. localName "content" AND a "url" attribute (media:content has url attribute) |
|
// 2. nodeName includes "media:content" |
|
// 3. namespaceURI includes "media" |
|
// We exclude content:encoded which has localName "encoded" (not "content") |
|
if (localName === 'content') { |
|
// If it has a url attribute, it's likely media:content (content:encoded doesn't have url) |
|
if (child.getAttribute('url')) { |
|
return true |
|
} |
|
// Check namespace - media:content is in media namespace |
|
if (namespaceURI && namespaceURI.includes('media')) { |
|
return true |
|
} |
|
// Check nodeName for media: prefix |
|
if (nodeName.includes('media:content') || nodeName.startsWith('media:')) { |
|
return true |
|
} |
|
} |
|
return false |
|
}) |
|
|
|
logger.debug('[RssFeedService] Found media:content elements', { |
|
url: feedUrl, |
|
itemTitle: itemTitle.substring(0, 50), |
|
mediaCount: mediaContentElements.length, |
|
allChildrenCount: allChildren.length |
|
}) |
|
|
|
mediaContentElements.forEach((mediaEl) => { |
|
const url = mediaEl.getAttribute('url') || '' |
|
const type = mediaEl.getAttribute('type') || undefined |
|
const width = mediaEl.getAttribute('width') || undefined |
|
const height = mediaEl.getAttribute('height') || undefined |
|
|
|
if (url) { |
|
// Get media:credit (attribution) - check children for credit element |
|
let credit: string | undefined |
|
const creditElements = Array.from(mediaEl.children).filter(child => { |
|
const localName = child.localName || child.nodeName |
|
return localName === 'credit' || child.nodeName === 'media:credit' |
|
}) |
|
if (creditElements.length > 0) { |
|
credit = creditElements[0].textContent?.trim() || creditElements[0].getAttribute('scheme') || undefined |
|
} |
|
|
|
// Get media:thumbnail - check children for thumbnail element |
|
let thumbnail: string | undefined |
|
const thumbnailElements = Array.from(mediaEl.children).filter(child => { |
|
const localName = child.localName || child.nodeName |
|
return localName === 'thumbnail' || child.nodeName === 'media:thumbnail' |
|
}) |
|
if (thumbnailElements.length > 0) { |
|
thumbnail = thumbnailElements[0].getAttribute('url') || undefined |
|
} |
|
|
|
media.push({ |
|
url, |
|
type, |
|
credit, |
|
thumbnail, |
|
width, |
|
height |
|
}) |
|
} |
|
}) |
|
|
|
// Also check for media:thumbnail at item level (if no media:content found) |
|
if (media.length === 0) { |
|
const thumbnailElementsAtItemLevel = Array.from(item.children).filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
return (localName === 'thumbnail' && (nodeName.includes('media:thumbnail') || child.namespaceURI?.includes('media'))) || |
|
nodeName === 'media:thumbnail' |
|
}) |
|
|
|
thumbnailElementsAtItemLevel.forEach((thumbEl) => { |
|
const url = thumbEl.getAttribute('url') || '' |
|
if (url) { |
|
media.push({ |
|
url, |
|
type: 'image', |
|
thumbnail: url |
|
}) |
|
} |
|
}) |
|
} |
|
|
|
items.push({ |
|
title: itemTitle, |
|
link: itemLink, |
|
description: itemDescription, |
|
pubDate: itemPubDate, |
|
guid: itemGuid, |
|
feedUrl, |
|
feedTitle: title, |
|
feedImage: feedImage?.url, |
|
feedDescription: description, |
|
media: media.length > 0 ? media : undefined, |
|
enclosure: enclosure || undefined |
|
}) |
|
}) |
|
|
|
return { |
|
title, |
|
link, |
|
description, |
|
items, |
|
feedUrl, |
|
image: feedImage, |
|
language, |
|
copyright, |
|
generator, |
|
lastBuildDate |
|
} |
|
} |
|
|
|
/** |
|
* Parse Atom 1.0 feed |
|
*/ |
|
private parseAtomFeed(doc: Document, feedUrl: string): RssFeed { |
|
const feed = doc.documentElement |
|
|
|
const title = this.getTextContent(feed, 'title') || 'Untitled Feed' |
|
const linkElement = feed.querySelector('link[rel="alternate"], link:not([rel])') |
|
const link = linkElement?.getAttribute('href') || feedUrl |
|
const description = this.getTextContent(feed, 'subtitle') || this.getTextContent(feed, 'description') || '' |
|
|
|
// Extract feed metadata for Atom feeds |
|
const language = feed.getAttribute('xml:lang') || undefined |
|
const rights = this.getTextContent(feed, 'rights') || undefined |
|
const generator = this.getTextContent(feed, 'generator') || undefined |
|
const updatedStr = this.getTextContent(feed, 'updated') |
|
const lastBuildDate = updatedStr ? (this.parseDate(updatedStr) || undefined) : undefined |
|
|
|
// Extract feed image/logo for Atom feeds |
|
let feedImage: RssFeed['image'] | undefined |
|
const logoElement = feed.querySelector('logo') |
|
const iconElement = feed.querySelector('icon') |
|
if (logoElement) { |
|
const logoUrl = this.getTextContent(feed, 'logo') |
|
if (logoUrl) { |
|
feedImage = { url: logoUrl } |
|
} |
|
} else if (iconElement) { |
|
const iconUrl = this.getTextContent(feed, 'icon') |
|
if (iconUrl) { |
|
feedImage = { url: iconUrl } |
|
} |
|
} |
|
|
|
const items: RssFeedItem[] = [] |
|
const entryElements = feed.querySelectorAll('entry') |
|
|
|
entryElements.forEach((entry) => { |
|
const entryTitle = this.getTextContent(entry, 'title') || '' |
|
const entryLinkElement = entry.querySelector('link[rel="alternate"], link:not([rel])') |
|
let entryLink = entryLinkElement?.getAttribute('href') || '' |
|
// Convert relative URLs to absolute |
|
if (entryLink && !entryLink.startsWith('http://') && !entryLink.startsWith('https://')) { |
|
try { |
|
const baseUrl = new URL(feedUrl) |
|
entryLink = new URL(entryLink, baseUrl.origin).href |
|
} catch { |
|
// If URL parsing fails, keep the original link |
|
} |
|
} |
|
if (entryLink) { |
|
const cleanedEntryLink = cleanUrl(entryLink) |
|
if (cleanedEntryLink) entryLink = cleanedEntryLink |
|
} |
|
// For content/summary, preserve HTML content |
|
let entryContent = this.getHtmlContent(entry, 'content') || this.getHtmlContent(entry, 'summary') || '' |
|
// Additional cleaning for Atom feeds (getHtmlContent already does basic cleaning) |
|
// This ensures any remaining XML artifacts are removed |
|
if (entryContent) { |
|
entryContent = entryContent |
|
.replace(/\]\]\s*>\s*$/gm, '') |
|
.replace(/^\s*<!\[CDATA\[/gm, '') |
|
.trim() |
|
} |
|
const entryPublished = this.getTextContent(entry, 'published') || this.getTextContent(entry, 'updated') |
|
const entryPubDate = this.parseDate(entryPublished) |
|
let entryId = this.getTextContent(entry, 'id') || entryLink || '' |
|
if (entryId && (entryId.startsWith('http://') || entryId.startsWith('https://'))) { |
|
const cleanedId = cleanUrl(entryId) |
|
if (cleanedId) entryId = cleanedId |
|
} |
|
|
|
// Extract enclosure/link elements for Atom feeds (Atom uses <link rel="enclosure">) |
|
let enclosure: RssFeedItemEnclosure | undefined |
|
const enclosureLinkElements = entry.querySelectorAll('link[rel="enclosure"]') |
|
if (enclosureLinkElements.length > 0) { |
|
const enclosureLink = enclosureLinkElements[0] |
|
const enclosureUrl = enclosureLink.getAttribute('href') || '' |
|
const enclosureType = enclosureLink.getAttribute('type') || '' |
|
const enclosureLength = enclosureLink.getAttribute('length') || undefined |
|
|
|
if (enclosureUrl && enclosureType) { |
|
// Try to get duration from itunes:duration |
|
let duration: string | undefined |
|
const allEntryChildren = Array.from(entry.children) |
|
const durationElements = allEntryChildren.filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
const namespaceURI = child.namespaceURI |
|
return (localName === 'duration' && (nodeName.includes('itunes:duration') || namespaceURI?.includes('itunes'))) || |
|
nodeName === 'itunes:duration' |
|
}) |
|
|
|
if (durationElements.length > 0) { |
|
duration = durationElements[0].textContent?.trim() || undefined |
|
} |
|
|
|
enclosure = { |
|
url: enclosureUrl, |
|
type: enclosureType, |
|
length: enclosureLength, |
|
duration: duration |
|
} |
|
} |
|
} |
|
|
|
// Extract media:content elements (Media RSS) for Atom feeds |
|
// In Atom feeds, we need to distinguish between media:content (media) and content (entry content) |
|
// Handle namespaced elements by checking all elements and filtering by namespace |
|
const media: RssFeedItemMedia[] = [] |
|
|
|
// Get all child elements and filter for media:content |
|
// media:content has localName "content" but is in the media namespace (not Atom namespace) |
|
const allChildren = Array.from(entry.children) |
|
const mediaContentElements = allChildren.filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
const namespaceURI = child.namespaceURI |
|
// Check if it's media:content - must have localName "content" but NOT be in Atom namespace |
|
// Atom content element is in Atom namespace, media:content is in media namespace |
|
if (localName === 'content') { |
|
// If it has a url attribute, it's likely media:content (Atom content uses src or type="xhtml") |
|
if (child.getAttribute('url')) { |
|
return true |
|
} |
|
// Check namespace - media:content is in media namespace, not Atom namespace |
|
if (namespaceURI && namespaceURI.includes('media') && !namespaceURI.includes('atom')) { |
|
return true |
|
} |
|
// Check nodeName for media: prefix |
|
if (nodeName.includes('media:content')) { |
|
return true |
|
} |
|
} |
|
return false |
|
}) |
|
|
|
mediaContentElements.forEach((mediaEl) => { |
|
const url = mediaEl.getAttribute('url') || '' |
|
const type = mediaEl.getAttribute('type') || undefined |
|
const width = mediaEl.getAttribute('width') || undefined |
|
const height = mediaEl.getAttribute('height') || undefined |
|
|
|
if (url) { |
|
// Get media:credit (attribution) - check children for credit element |
|
let credit: string | undefined |
|
const creditElements = Array.from(mediaEl.children).filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
return (localName === 'credit' && (nodeName.includes('media:credit') || child.namespaceURI?.includes('media'))) || |
|
nodeName === 'media:credit' |
|
}) |
|
if (creditElements.length > 0) { |
|
credit = creditElements[0].textContent?.trim() || creditElements[0].getAttribute('scheme') || undefined |
|
} |
|
|
|
// Get media:thumbnail - check children for thumbnail element |
|
let thumbnail: string | undefined |
|
const thumbnailElements = Array.from(mediaEl.children).filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
return (localName === 'thumbnail' && (nodeName.includes('media:thumbnail') || child.namespaceURI?.includes('media'))) || |
|
nodeName === 'media:thumbnail' |
|
}) |
|
if (thumbnailElements.length > 0) { |
|
thumbnail = thumbnailElements[0].getAttribute('url') || undefined |
|
} |
|
|
|
media.push({ |
|
url, |
|
type, |
|
credit, |
|
thumbnail, |
|
width, |
|
height |
|
}) |
|
} |
|
}) |
|
|
|
// Also check for media:thumbnail at entry level (if no media:content found) |
|
if (media.length === 0) { |
|
const thumbnailElementsAtEntryLevel = Array.from(entry.children).filter(child => { |
|
const localName = child.localName || child.nodeName.toLowerCase() |
|
const nodeName = child.nodeName.toLowerCase() |
|
return (localName === 'thumbnail' && (nodeName.includes('media:thumbnail') || child.namespaceURI?.includes('media'))) || |
|
nodeName === 'media:thumbnail' |
|
}) |
|
|
|
thumbnailElementsAtEntryLevel.forEach((thumbEl) => { |
|
const url = thumbEl.getAttribute('url') || '' |
|
if (url) { |
|
media.push({ |
|
url, |
|
type: 'image', |
|
thumbnail: url |
|
}) |
|
} |
|
}) |
|
} |
|
|
|
items.push({ |
|
title: entryTitle, |
|
link: entryLink, |
|
description: entryContent, |
|
pubDate: entryPubDate, |
|
guid: entryId, |
|
feedUrl, |
|
feedTitle: title, |
|
feedImage: feedImage?.url, |
|
feedDescription: description, |
|
media: media.length > 0 ? media : undefined, |
|
enclosure: enclosure |
|
}) |
|
}) |
|
|
|
return { |
|
title, |
|
link, |
|
description, |
|
items, |
|
feedUrl, |
|
image: feedImage, |
|
language, |
|
copyright: rights, |
|
generator, |
|
lastBuildDate |
|
} |
|
} |
|
|
|
/** |
|
* Get text content from an element, handling CDATA and nested elements |
|
*/ |
|
private getTextContent(element: Element | null, tagName: string): string { |
|
if (!element) return '' |
|
const child = element.querySelector(tagName) |
|
if (!child) return '' |
|
// Get text content which automatically decodes HTML entities |
|
return child.textContent?.trim() || '' |
|
} |
|
|
|
/** |
|
* Get HTML content from an element (for descriptions that may contain HTML) |
|
*/ |
|
private getHtmlContent(element: Element | null, tagName: string): string { |
|
if (!element) return '' |
|
// Handle namespaced tags like content:encoded |
|
const child = element.querySelector(tagName) || |
|
element.querySelector(tagName.replace(':', '\\:')) || |
|
element.getElementsByTagName(tagName)[0] |
|
if (!child) return '' |
|
|
|
// Get innerHTML to preserve HTML formatting and CDATA content |
|
// CDATA sections are automatically included in innerHTML/textContent |
|
let html = child.innerHTML?.trim() || child.textContent?.trim() || '' |
|
|
|
if (!html) return '' |
|
|
|
// Decode HTML entities that might be encoded (like < > & etc.) |
|
// The browser's XML parser should decode entities automatically when accessing textContent/innerHTML |
|
// However, if entities are still present, decode them using textarea trick |
|
// This handles cases where entities are double-encoded or in raw XML text |
|
if (html.includes('<') || html.includes('>') || html.includes('&')) { |
|
// HTML entities are present, decode them |
|
const decoder = document.createElement('textarea') |
|
decoder.innerHTML = html |
|
html = decoder.value |
|
} |
|
|
|
// Also decode numeric entities (like —) using the same method |
|
// The textarea approach handles both named and numeric entities |
|
const temp = document.createElement('textarea') |
|
temp.innerHTML = html |
|
html = temp.value || html |
|
|
|
// Clean up any XML artifacts that might have leaked through |
|
// Do this AFTER entity decoding, as entities might encode XML artifacts |
|
html = html |
|
.replace(/\]\]\s*>\s*$/gm, '') // Remove trailing ]]> from CDATA (multiline, end of string) |
|
.replace(/\]\]\s*>/g, '') // Remove any ]]> anywhere in the content |
|
.replace(/^\s*<!\[CDATA\[/gm, '') // Remove leading CDATA declaration (multiline, start of string) |
|
.replace(/<!\[CDATA\[/g, '') // Remove any CDATA declaration anywhere |
|
.replace(/<\?xml[^>]*\?>/gi, '') // Remove XML declarations |
|
.replace(/<\!DOCTYPE[^>]*>/gi, '') // Remove DOCTYPE declarations |
|
.replace(/xmlns[=:][^=]*=["'][^"']*["']/gi, '') // Remove xmlns attributes |
|
.trim() |
|
|
|
return html |
|
} |
|
|
|
/** |
|
* Build a map of foreign month names to English abbreviations using Intl.DateTimeFormat |
|
* This handles month names in various languages automatically |
|
*/ |
|
private buildMonthMap(): Record<string, string> { |
|
// Common locales that might appear in RSS feeds |
|
const locales = ['de', 'fr', 'es', 'it', 'pt', 'pt-BR', 'ru', 'pl', 'nl', 'sv', 'no', 'da', 'fi', 'cs', 'hu', 'ro', 'sk', 'sl', 'hr', 'bg', 'el', 'tr', 'ja', 'ko', 'zh', 'ar', 'he', 'th', 'vi', 'hi', 'fa'] |
|
|
|
const monthMap: Record<string, string> = {} |
|
const year = new Date().getFullYear() |
|
|
|
// English month abbreviations (0-11 index to English abbrev) |
|
const englishMonths = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
|
|
// Build map for each locale |
|
for (const locale of locales) { |
|
try { |
|
const formatter = new Intl.DateTimeFormat(locale, { month: 'short' }) |
|
for (let monthIndex = 0; monthIndex < 12; monthIndex++) { |
|
const foreignMonth = formatter.format(new Date(year, monthIndex)) |
|
const englishMonth = englishMonths[monthIndex] |
|
|
|
// Add both the full foreign month name and its lowercase version |
|
if (foreignMonth && englishMonth) { |
|
const trimmed = foreignMonth.trim() |
|
// Locales may emit numeric or odd tokens; never map those (would match "12:01:00" as \b12\b). |
|
if (/^\d+$/.test(trimmed)) { |
|
continue |
|
} |
|
monthMap[foreignMonth] = englishMonth |
|
monthMap[foreignMonth.toLowerCase()] = englishMonth |
|
// Also handle common variations (first 3 letters) |
|
if (foreignMonth.length >= 3) { |
|
const abbrev = foreignMonth.substring(0, 3).trim() |
|
if (abbrev.length >= 2 && !/^\d/.test(abbrev)) { |
|
monthMap[abbrev] = englishMonth |
|
monthMap[abbrev.toLowerCase()] = englishMonth |
|
} |
|
} |
|
} |
|
} |
|
} catch { |
|
// Skip locales that fail to format |
|
continue |
|
} |
|
} |
|
|
|
return monthMap |
|
} |
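// Illustration only (the exact keys depend on the runtime's Intl/CLDR data): for German,
// Intl.DateTimeFormat('de', { month: 'short' }) typically formats December as "Dez.", so the
// map gains entries such as 'Dez.' -> 'Dec', 'dez.' -> 'Dec', 'Dez' -> 'Dec', 'dez' -> 'Dec',
// which parseDate() below uses to swap a localized month token for one Date.parse understands.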
|
|
|
/** |
|
* Parse date string into Date object |
|
* Handles non-standard formats by skipping weekday and mapping foreign month names |
|
*/ |
|
private parseDate(dateString: string | null): Date | null { |
|
if (!dateString) return null |
|
|
|
// First, try standard Date parsing |
|
try { |
|
const standardDate = new Date(dateString) |
|
if (!isNaN(standardDate.getTime())) { |
|
return standardDate |
|
} |
|
} catch { |
|
// Continue to fallback parsing |
|
} |
|
|
|
// Handle non-standard formats (e.g., "Don, 06 Nov 2025 15:24:25") |
|
// Skip the weekday part (everything up to and including the first comma) |
|
let dateToParse = dateString.trim() |
|
const commaIndex = dateToParse.indexOf(',') |
|
if (commaIndex > 0) { |
|
// Skip weekday and comma, keep the rest |
|
dateToParse = dateToParse.substring(commaIndex + 1).trim() |
|
} |
|
|
|
// Build month map using Intl.DateTimeFormat (lazy initialization) |
|
if (!this.monthMapCache) { |
|
this.monthMapCache = this.buildMonthMap() |
|
logger.debug('[RssFeedService] Built month map', { |
|
monthCount: Object.keys(this.monthMapCache).length, |
|
sampleMonths: Object.entries(this.monthMapCache).slice(0, 5) |
|
}) |
|
} |
|
|
|
// Replace foreign month names with English equivalents (longest key first so "September" beats "Sep"; |
|
// skip pure-numeric keys so "12:01:00" is never touched by a spurious "12" → "Dec" map entry). |
|
let monthReplaced = false |
|
const monthEntries = Object.entries(this.monthMapCache) |
|
.filter(([foreign]) => !/^\d+$/.test(foreign.trim())) |
|
.sort((a, b) => b[0].length - a[0].length) |
|
|
|
for (const [foreign, english] of monthEntries) { |
|
const regex = new RegExp(`\\b${this.escapeRegex(foreign)}\\b`, 'i') |
|
if (regex.test(dateToParse)) { |
|
dateToParse = dateToParse.replace(regex, english) |
|
monthReplaced = true |
|
logger.debug('[RssFeedService] Replaced month name', { foreign, english, original: dateString, afterReplace: dateToParse }) |
|
break |
|
} |
|
} |
|
|
|
// If no timezone is specified, assume UTC (common for RSS feeds) |
|
const hasTimezone = /[+-]\d{4}|GMT|UTC|EST|PST|CET|CEST|CST|EDT|PDT$/i.test(dateToParse) |
|
if (!hasTimezone && dateToParse.match(/\d{2}:\d{2}:\d{2}$/)) { |
|
// Add UTC timezone if time is present but no timezone |
|
dateToParse += ' UTC' |
|
} |
|
|
|
try { |
|
const parsedDate = new Date(dateToParse) |
|
if (!isNaN(parsedDate.getTime())) { |
|
logger.debug('[RssFeedService] Successfully parsed date', { |
|
original: dateString, |
|
parsed: dateToParse, |
|
result: parsedDate.toISOString() |
|
}) |
|
return parsedDate |
|
} else { |
|
logger.warn('[RssFeedService] Date parsing resulted in invalid date', { |
|
original: dateString, |
|
parsed: dateToParse, |
|
monthReplaced |
|
}) |
|
} |
|
} catch (error) { |
|
logger.warn('[RssFeedService] Date parsing threw error', { |
|
original: dateString, |
|
parsed: dateToParse, |
|
error: error instanceof Error ? error.message : String(error), |
|
monthReplaced |
|
}) |
|
} |
|
|
|
return null |
|
} |
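// Worked example using the format mentioned above: given "Don, 06 Nov 2025 15:24:25" (German
// weekday, so standard Date parsing typically fails), the fallback strips everything up to the
// comma ("06 Nov 2025 15:24:25"), leaves the already-English month token in place, appends
// " UTC" because a time is present without a timezone, and parses "06 Nov 2025 15:24:25 UTC".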
|
|
|
/** |
|
* Escape special regex characters in a string |
|
*/ |
|
private escapeRegex(str: string): string { |
|
return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') |
|
} |
|
|
|
/** |
|
* Get feed URLs to use (from event or default) |
|
* If eventFeedUrls is an empty array, return empty array (user has event but no feeds) |
|
* If eventFeedUrls is null/undefined, return default feeds (no event exists) |
|
*/ |
|
getFeedUrls(eventFeedUrls: string[] | null | undefined): string[] { |
|
// If eventFeedUrls is explicitly an array (even if empty), use it |
|
// This means the user has an event, so respect their choice |
|
if (Array.isArray(eventFeedUrls)) { |
|
return eventFeedUrls |
|
} |
|
// If null/undefined, no event exists - use defaults for demo |
|
return DEFAULT_RSS_FEEDS |
|
} |
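// The three cases, concretely:
//   getFeedUrls(['https://example.com/feed.xml']) // => ['https://example.com/feed.xml'] (user's list)
//   getFeedUrls([])                               // => [] (user has an event with no feeds)
//   getFeedUrls(null)                             // => DEFAULT_RSS_FEEDS (no event yet, demo defaults)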
|
|
|
/** |
|
* Fetch multiple feeds and merge items |
|
* Cache-first: reads from IndexedDB, displays immediately, then background-refreshes to merge new items |
|
*/ |
|
async fetchMultipleFeeds(feedUrls: string[], signal?: AbortSignal): Promise<RssFeedItem[]> { |
|
if (feedUrls.length === 0) { |
|
return [] |
|
} |
|
|
|
// Check if already aborted |
|
if (signal?.aborted) { |
|
throw new DOMException('The operation was aborted.', 'AbortError') |
|
} |
|
|
|
await this.ensureRssFeedAttemptedKeysLoaded() |
|
|
|
// Step 1: Read from IndexedDB cache first (cache-first strategy) |
|
let cachedItems: RssFeedItem[] = [] |
|
try { |
|
const allCachedItems = await indexedDb.getRssFeedItems() |
|
logger.info('[RssFeedService] Retrieved all cached items from IndexedDB', { |
|
totalCached: allCachedItems.length |
|
}) |
|
|
|
// Filter to only items from the requested feeds |
|
const normalizedRequestedUrls = new Set(feedUrls.map((u) => this.normalizeRssFeedKeyUrl(u))) |
|
|
|
cachedItems = allCachedItems.filter(item => { |
|
const normalizedItemUrl = this.normalizeRssFeedKeyUrl(item.feedUrl) |
|
const matches = normalizedRequestedUrls.has(normalizedItemUrl) |
|
if (!matches && allCachedItems.length > 0 && allCachedItems.length < 10) { |
|
// Only log for small sets to avoid spam |
|
logger.debug('[RssFeedService] Item filtered out (feed URL not in requested list)', { |
|
itemFeedUrl: item.feedUrl, |
|
normalizedItemUrl, |
|
requestedFeeds: feedUrls, |
|
normalizedRequestedUrls: Array.from(normalizedRequestedUrls), |
|
itemGuid: item.guid?.substring(0, 20) |
|
}) |
|
} |
|
return matches |
|
}) |
|
|
|
logger.info('[RssFeedService] Filtered cached items by feed URLs', { |
|
beforeFilter: allCachedItems.length, |
|
afterFilter: cachedItems.length, |
|
requestedFeedCount: feedUrls.length, |
|
uniqueCachedFeedUrls: [...new Set(allCachedItems.map(i => i.feedUrl))], |
|
requestedFeedUrls: feedUrls |
|
}) |
|
|
|
// Convert pubDate back to Date objects (handle both Date objects and timestamps/strings) |
|
cachedItems = cachedItems.map(item => { |
|
let pubDate: Date | null = null |
|
if (item.pubDate) { |
|
if (item.pubDate instanceof Date) { |
|
pubDate = item.pubDate |
|
} else if (typeof item.pubDate === 'number') { |
|
pubDate = new Date(item.pubDate) |
|
} else if (typeof item.pubDate === 'string') { |
|
pubDate = new Date(item.pubDate) |
|
} |
|
} |
|
return { |
|
...item, |
|
pubDate |
|
} |
|
}) |
|
|
|
logger.info('[RssFeedService] Loaded cached items from IndexedDB', { |
|
cachedCount: cachedItems.length, |
|
feedCount: feedUrls.length, |
|
filteredCount: cachedItems.length, |
|
feedUrls: feedUrls |
|
}) |
|
} catch (error) { |
|
logger.warn('[RssFeedService] Failed to load cached items from IndexedDB', { error }) |
|
} |
|
|
|
const cacheWasEmpty = cachedItems.length === 0 |
|
|
|
// Missing = no cached rows for this feed URL and we have not yet completed a fetch pass for it |
|
const cachedFeedUrls = new Set(cachedItems.map((item) => this.normalizeRssFeedKeyUrl(item.feedUrl))) |
|
const missingFeeds = feedUrls.filter( |
|
(url) => |
|
!cachedFeedUrls.has(this.normalizeRssFeedKeyUrl(url)) && |
|
!this.rssFeedAttemptedKeys.has(this.normalizeRssFeedKeyUrl(url)) |
|
) |
|
|
|
if (missingFeeds.length > 0) { |
|
logger.info('[RssFeedService] Some feeds are missing from cache, will fetch them', { |
|
missingFeeds, |
|
cachedFeedUrls: Array.from(cachedFeedUrls), |
|
requestedFeeds: feedUrls |
|
}) |
|
} |
|
|
|
// Step 2: Background refresh — never tied to React's AbortSignal (Strict Mode / HMR / remount would cancel network). |
|
const refreshAc = new AbortController() |
|
const refreshSignal = refreshAc.signal |
|
|
|
const backgroundRefresh = async (): Promise<void> => { |
|
const dedupeKey = this.rssMultiFeedRefreshKey(feedUrls) |
|
const inflight = this.rssMultiFeedRefreshInFlight.get(dedupeKey) |
|
if (inflight) { |
|
await inflight |
|
return |
|
} |
|
|
|
const run = async (): Promise<void> => { |
|
if (refreshSignal.aborted) { |
|
return |
|
} |
|
|
|
logger.info('[RssFeedService] Starting background refresh', { |
|
feedCount: feedUrls.length, |
|
feedUrls, |
|
cacheWasEmpty, |
|
cachedItemCount: cachedItems.length |
|
}) |
|
|
|
if (refreshSignal.aborted) { |
|
logger.warn('[RssFeedService] Background refresh aborted before starting', { |
|
feedCount: feedUrls.length |
|
}) |
|
return |
|
} |
|
|
|
try { |
|
logger.info('[RssFeedService] Starting to fetch feeds', { |
|
feedCount: feedUrls.length, |
|
feedUrls, |
|
signalAborted: refreshSignal.aborted |
|
}) |
|
|
|
const results = await Promise.allSettled( |
|
feedUrls.map((url) => { |
|
if (refreshSignal.aborted) { |
|
logger.warn('[RssFeedService] Signal aborted before fetching feed', { url }) |
|
return Promise.reject(new DOMException('The operation was aborted.', 'AbortError')) |
|
} |
|
logger.debug('[RssFeedService] Fetching feed', { url, signalAborted: refreshSignal.aborted }) |
|
return this.fetchFeed(url, refreshSignal) |
|
}) |
|
) |
|
|
|
if (refreshSignal.aborted) { |
|
logger.warn('[RssFeedService] Signal aborted after fetching feeds', { |
|
feedCount: feedUrls.length |
|
}) |
|
return |
|
} |
|
|
|
const newItems: RssFeedItem[] = [] |
|
let successCount = 0 |
|
let failureCount = 0 |
|
let abortCount = 0 |
|
|
|
results.forEach((result, index) => { |
|
if (result.status === 'fulfilled') { |
|
newItems.push(...result.value.items) |
|
successCount++ |
|
logger.info('[RssFeedService] Successfully fetched feed', { |
|
url: feedUrls[index], |
|
itemCount: result.value.items.length, |
|
feedTitle: result.value.title |
|
}) |
|
} else { |
|
failureCount++ |
|
const error = result.reason |
|
if (error instanceof DOMException && error.name === 'AbortError') { |
|
abortCount++ |
|
logger.warn('[RssFeedService] Feed fetch was aborted', { |
|
url: feedUrls[index], |
|
reason: error.message || 'AbortError' |
|
}) |
|
return |
|
} |
|
const errorMessage = error instanceof Error ? error.message : String(error) |
|
logger.warn('[RssFeedService] Failed to fetch feed after trying all strategies', { |
|
url: feedUrls[index], |
|
error: errorMessage, |
|
errorStack: error instanceof Error ? error.stack : undefined, |
|
errorType: error?.constructor?.name |
|
}) |
|
} |
|
}) |
|
|
|
logger.info('[RssFeedService] Background refresh completed', { |
|
successCount, |
|
failureCount, |
|
abortCount, |
|
newItemCount: newItems.length, |
|
totalFeeds: feedUrls.length |
|
}) |
|
|
|
if (!refreshSignal.aborted) { |
|
this.markFeedKeysAttempted(feedUrls) |
|
await this.persistRssFeedAttemptedKeys() |
|
} |
|
|
|
if (!refreshSignal.aborted && successCount > 0) { |
|
const itemMap = new Map<string, RssFeedItem>() |
|
|
|
cachedItems.forEach((item) => { |
|
const key = `${item.feedUrl}:${item.guid}` |
|
itemMap.set(key, item) |
|
}) |
|
|
|
newItems.forEach((item) => { |
|
const key = `${item.feedUrl}:${item.guid}` |
|
const existing = itemMap.get(key) |
|
if (!existing || (item.pubDate && existing.pubDate && item.pubDate > existing.pubDate)) { |
|
itemMap.set(key, item) |
|
} |
|
}) |
|
|
|
const mergedItems = Array.from(itemMap.values()) |
|
|
|
mergedItems.sort((a, b) => { |
|
const dateA = a.pubDate?.getTime() || 0 |
|
const dateB = b.pubDate?.getTime() || 0 |
|
return dateB - dateA |
|
}) |
|
|
|
try { |
|
await this.persistGlobalRssCacheAfterMerge(mergedItems, feedUrls) |
|
logger.info('[RssFeedService] Updated IndexedDB cache with merged items', { |
|
mergedFromThisRefresh: mergedItems.length, |
|
newItems: newItems.length, |
|
cachedItems: cachedItems.length |
|
}) |
|
} catch (error) { |
|
logger.error('[RssFeedService] Failed to update IndexedDB cache', { error }) |
|
} |
|
} |
|
} catch (error) { |
|
if (!(error instanceof DOMException && error.name === 'AbortError')) { |
|
logger.error('[RssFeedService] Background refresh failed', { error }) |
|
} |
|
} |
|
} |
|
|
|
const p = run() |
|
this.rssMultiFeedRefreshInFlight.set(dedupeKey, p) |
|
try { |
|
await p |
|
} finally { |
|
if (this.rssMultiFeedRefreshInFlight.get(dedupeKey) === p) { |
|
this.rssMultiFeedRefreshInFlight.delete(dedupeKey) |
|
} |
|
} |
|
} |
|
|
|
// Wait only while some requested feeds are still unknown (no cache rows and no completed fetch pass) |
|
const shouldWaitForRefresh = missingFeeds.length > 0 |
|
|
|
if (shouldWaitForRefresh) { |
|
logger.info('[RssFeedService] Waiting for background refresh to complete', { |
|
feedCount: feedUrls.length, |
|
cacheWasEmpty, |
|
missingFeedsCount: missingFeeds.length, |
|
missingFeeds |
|
}) |
|
try { |
|
const callerGone = signal |
|
? new Promise<void>((resolve) => { |
|
if (signal.aborted) resolve() |
|
else signal.addEventListener('abort', () => resolve(), { once: true }) |
|
}) |
|
: new Promise<void>(() => { |
|
/* never */ |
|
}) |
|
|
|
// Caller abort ends the wait early; refresh keeps running on refreshAc.signal |
|
await Promise.race([ |
|
backgroundRefresh(), |
|
new Promise<void>((resolve) => setTimeout(() => resolve(), 30000)), |
|
callerGone |
|
]) |
|
|
|
// Re-read from cache after background refresh |
|
try { |
|
const refreshedItems = await indexedDb.getRssFeedItems() |
|
const feedUrlSet = new Set(feedUrls.map((u) => this.normalizeRssFeedKeyUrl(u))) |
|
cachedItems = refreshedItems |
|
.filter((item) => feedUrlSet.has(this.normalizeRssFeedKeyUrl(item.feedUrl))) |
|
.map(item => ({ |
|
...item, |
|
pubDate: item.pubDate ? new Date(item.pubDate) : null |
|
})) |
|
|
|
logger.info('[RssFeedService] Loaded items after background refresh', { |
|
itemCount: cachedItems.length, |
|
feedCount: feedUrls.length |
|
}) |
|
} catch (error) { |
|
logger.warn('[RssFeedService] Failed to reload cached items after background refresh', { error }) |
|
} |
|
} catch (error) { |
|
if (!(error instanceof DOMException && error.name === 'AbortError')) { |
|
logger.error('[RssFeedService] Background refresh error during initial load', { error }) |
|
} |
|
} |
|
} else { |
|
// Cache already has all requested feeds; kick off the refresh without waiting for it |
|
logger.debug('[RssFeedService] All feeds in cache, starting background refresh without waiting') |
|
void backgroundRefresh().catch((err) => { |
|
if (!(err instanceof DOMException && err.name === 'AbortError')) { |
|
logger.error('[RssFeedService] Background refresh error', { error: err }) |
|
} |
|
}) |
|
} |
|
|
|
// Return cached items (now potentially updated from background refresh) |
|
// Sort by publication date (newest first) |
|
cachedItems.sort((a, b) => { |
|
const dateA = a.pubDate?.getTime() || 0 |
|
const dateB = b.pubDate?.getTime() || 0 |
|
return dateB - dateA |
|
}) |
|
|
|
return cachedItems |
|
} |
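// Usage sketch (assumed caller, importing the default export as rssFeedService): the resolved
// array is whatever IndexedDB holds for these feeds, newest first; the network refresh keeps
// running on its own controller even if the caller aborts, so freshly fetched items may only
// show up on the next call.
//
//   const ac = new AbortController()
//   const items = await rssFeedService.fetchMultipleFeeds(
//     ['https://example.com/feed.xml', 'https://blog.example.org/atom.xml'],
//     ac.signal
//   )
//   items.forEach((i) => console.log(i.feedTitle, i.title, i.pubDate?.toISOString()))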
|
|
|
/** |
|
* Trigger a background refresh for specific feed URLs (without returning cached items) |
|
* This is useful when you want to force a refresh after updating the feed list |
|
* Aborts any existing background refresh before starting a new one |
|
*/ |
|
async backgroundRefreshFeeds(feedUrls: string[], signal?: AbortSignal): Promise<void> { |
|
if (feedUrls.length === 0) { |
|
return |
|
} |
|
|
|
await this.ensureRssFeedAttemptedKeysLoaded() |
|
for (const u of feedUrls) { |
|
this.rssFeedAttemptedKeys.delete(this.normalizeRssFeedKeyUrl(u)) |
|
} |
|
await this.persistRssFeedAttemptedKeys() |
|
|
|
// Abort any existing background refresh |
|
if (this.backgroundRefreshController) { |
|
logger.info('[RssFeedService] Aborting existing background refresh before starting new one') |
|
this.backgroundRefreshController.abort() |
|
this.backgroundRefreshController = null |
|
} |
|
this.rssMultiFeedRefreshInFlight.clear() |
|
|
|
// Create a new AbortController for this refresh |
|
const controller = new AbortController() |
|
this.backgroundRefreshController = controller |
|
|
|
// Combine with external signal if provided |
|
if (signal) { |
|
if (signal.aborted) { |
|
controller.abort() |
|
this.backgroundRefreshController = null |
|
return |
|
} |
|
signal.addEventListener('abort', () => { |
|
controller.abort() |
|
this.backgroundRefreshController = null |
|
}, { once: true }) |
|
} |
|
|
|
const combinedSignal = signal ? (() => { |
|
const combined = new AbortController() |
|
const abort = () => combined.abort() |
|
signal.addEventListener('abort', abort, { once: true }) |
|
controller.signal.addEventListener('abort', abort, { once: true }) |
|
return combined.signal |
|
})() : controller.signal |
|
|
|
try { |
|
const results = await Promise.allSettled( |
|
feedUrls.map(url => this.fetchFeed(url, combinedSignal)) |
|
) |
|
|
|
if (combinedSignal.aborted || controller.signal.aborted) { |
|
this.backgroundRefreshController = null |
|
return |
|
} |
|
|
|
const newItems: RssFeedItem[] = [] |
|
let successCount = 0 |
|
|
|
results.forEach((result, index) => { |
|
if (result.status === 'fulfilled') { |
|
newItems.push(...result.value.items) |
|
successCount++ |
|
logger.debug('[RssFeedService] Background refresh: successfully fetched feed', { |
|
url: feedUrls[index], |
|
itemCount: result.value.items.length |
|
}) |
|
} |
|
}) |
|
|
|
if (!combinedSignal.aborted && !controller.signal.aborted) { |
|
this.markFeedKeysAttempted(feedUrls) |
|
await this.persistRssFeedAttemptedKeys() |
|
} |
|
|
|
if (!combinedSignal.aborted && !controller.signal.aborted && successCount > 0) { |
|
// Get existing cached items |
|
let cachedItems: RssFeedItem[] = [] |
|
try { |
|
cachedItems = await indexedDb.getRssFeedItems() |
|
const feedUrlSet = new Set(feedUrls.map((u) => this.normalizeRssFeedKeyUrl(u)))
cachedItems = cachedItems.filter(item => feedUrlSet.has(this.normalizeRssFeedKeyUrl(item.feedUrl)))
|
cachedItems = cachedItems.map(item => ({ |
|
...item, |
|
pubDate: item.pubDate ? new Date(item.pubDate) : null |
|
})) |
|
} catch (error) { |
|
logger.warn('[RssFeedService] Failed to load cached items for background refresh', { error }) |
|
} |
|
|
|
// Merge new items with cached items (deduplicate by feedUrl:guid) |
|
const itemMap = new Map<string, RssFeedItem>() |
|
|
|
// Add cached items first |
|
cachedItems.forEach(item => { |
|
const key = `${item.feedUrl}:${item.guid}` |
|
itemMap.set(key, item) |
|
}) |
|
|
|
// Add/update with new items (newer items replace older ones) |
|
newItems.forEach(item => { |
|
const key = `${item.feedUrl}:${item.guid}` |
|
const existing = itemMap.get(key) |
|
if (!existing || (item.pubDate && existing.pubDate && item.pubDate > existing.pubDate)) { |
|
itemMap.set(key, item) |
|
} |
|
}) |
|
|
|
const mergedItems = Array.from(itemMap.values()) |
|
|
|
mergedItems.sort((a, b) => { |
|
const dateA = a.pubDate?.getTime() || 0 |
|
const dateB = b.pubDate?.getTime() || 0 |
|
return dateB - dateA |
|
}) |
|
|
|
try { |
|
await this.persistGlobalRssCacheAfterMerge(mergedItems, feedUrls) |
|
logger.info('[RssFeedService] Background refresh: updated IndexedDB cache', { |
|
mergedFromThisRefresh: mergedItems.length, |
|
newItems: newItems.length, |
|
cachedItems: cachedItems.length |
|
}) |
|
} catch (error) { |
|
logger.error('[RssFeedService] Background refresh: failed to update IndexedDB cache', { error }) |
|
} |
|
} |
|
|
|
// Clear the controller when done |
|
this.backgroundRefreshController = null |
|
} catch (error) { |
|
// Clear the controller on error |
|
this.backgroundRefreshController = null |
|
if (!(error instanceof DOMException && error.name === 'AbortError')) { |
|
logger.error('[RssFeedService] Background refresh failed', { error }) |
|
} |
|
} |
|
} |
|
|
|
/** |
|
* Clear cache for a specific feed or all feeds |
|
*/ |
|
clearCache(url?: string) { |
|
if (url) { |
|
this.feedCache.delete(url) |
|
this.rssFeedAttemptedKeys.delete(this.normalizeRssFeedKeyUrl(url)) |
|
void this.persistRssFeedAttemptedKeys() |
|
// Also clear from IndexedDB: rewrite the store so this feed's rows are actually removed
// (putting only the filtered list back would leave the old keys behind)
indexedDb.getRssFeedItems().then(async items => {
const filtered = items.filter(item => item.feedUrl !== url)
await indexedDb.clearRssFeedItems()
await indexedDb.putRssFeedItems(filtered)
}).catch(err => {
logger.error('[RssFeedService] Failed to clear feed from IndexedDB', { url, error: err })
})
|
} else { |
|
this.feedCache.clear() |
|
this.rssFeedAttemptedKeys.clear() |
|
void this.persistRssFeedAttemptedKeys() |
|
// Clear all from IndexedDB |
|
indexedDb.clearRssFeedItems().catch(err => { |
|
logger.error('[RssFeedService] Failed to clear IndexedDB cache', { error: err }) |
|
}) |
|
} |
|
} |
|
} |
|
|
|
const instance = new RssFeedService() |
|
export default instance |
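// Minimal end-to-end sketch of how a consumer might use this singleton (import path, wiring
// and feed URLs are placeholders, not prescribed by this module):
//
//   import rssFeedService from '@/services/rss-feed.service'
//
//   const urls = rssFeedService.getFeedUrls(eventFeedUrlsOrNull)
//   const items = await rssFeedService.fetchMultipleFeeds(urls)
//   // after the user edits their feed list, force a re-fetch:
//   await rssFeedService.backgroundRefreshFeeds(urls)
//   // drop everything (memory cache, attempted-key markers, IndexedDB rows):
//   rssFeedService.clearCache()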
|
|
|
|