From d2a9af19d2a6e67403129619e2c5dbf2f9664017 Mon Sep 17 00:00:00 2001 From: Silberengel Date: Mon, 1 Jun 2026 20:36:26 +0200 Subject: [PATCH] adjust og proxy --- src/lib/open-graph.test.ts | 70 ++++++++++++++ src/lib/open-graph.ts | 171 +++++++++++++++++++++++++++++++++ src/services/web.service.ts | 183 ++++++++++++------------------------ 3 files changed, 299 insertions(+), 125 deletions(-) create mode 100644 src/lib/open-graph.test.ts create mode 100644 src/lib/open-graph.ts diff --git a/src/lib/open-graph.test.ts b/src/lib/open-graph.test.ts new file mode 100644 index 00000000..99b677c8 --- /dev/null +++ b/src/lib/open-graph.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from 'vitest' +import { + htmlLooksLikeImwaldAppShell, + isImwaldDefaultOpenGraphDescription, + isImwaldDefaultOpenGraphTitle, + parseOpenGraphFromHtml +} from './open-graph' + +const IMWALD_INDEX_SNIPPET = ` + +Imwald + + + +
` + +const FOUNTAIN_SNIPPET = ` + + + + + +` + +describe('open-graph', () => { + it('detects Imwald app shell HTML', () => { + expect(htmlLooksLikeImwaldAppShell(IMWALD_INDEX_SNIPPET)).toBe(true) + expect(htmlLooksLikeImwaldAppShell(FOUNTAIN_SNIPPET)).toBe(false) + }) + + it('returns empty metadata for app shell on external URLs', () => { + expect(parseOpenGraphFromHtml(IMWALD_INDEX_SNIPPET, 'https://fountain.fm/episode/x')).toEqual({}) + }) + + it('parses og and twitter tags from a normal page', () => { + expect(parseOpenGraphFromHtml(FOUNTAIN_SNIPPET, 'https://fountain.fm/episode/x')).toEqual({ + title: 'Episode Title | Fountain', + description: 'A podcast episode', + image: 'https://fountain.fm/cover.jpg', + audio: 'https://fountain.fm/audio.mp3' + }) + }) + + it('strips Imwald default title even without trailing space', () => { + expect(isImwaldDefaultOpenGraphTitle('Imwald')).toBe(true) + expect(isImwaldDefaultOpenGraphTitle('Episode Title')).toBe(false) + }) + + it('strips Imwald default description case-insensitively', () => { + expect( + isImwaldDefaultOpenGraphDescription( + 'Imwald — a user-friendly Nostr client focused on relay feed browsing.' + ) + ).toBe(true) + }) + + it('filters jumble og-image on external hosts while keeping other fields', () => { + const html = ` + + + +` + expect(parseOpenGraphFromHtml(html, 'https://example.com/page')).toEqual({ + title: 'Real Site', + description: 'About the site', + image: undefined, + audio: undefined + }) + }) +}) diff --git a/src/lib/open-graph.ts b/src/lib/open-graph.ts new file mode 100644 index 00000000..e285ee9f --- /dev/null +++ b/src/lib/open-graph.ts @@ -0,0 +1,171 @@ +import { TWebMetadata } from '@/types' +import logger from '@/lib/logger' + +/** True when HTML is the Vite/React dev shell or another SPA stub, not the target page. */ +export function htmlLooksLikeLocalDevAppShell(html: string): boolean { + const head = html.slice(0, 8000) + return ( + head.includes('injectIntoGlobalHook') || + head.includes('/@vite/') || + head.includes('@vite/client') || + head.includes('@react-refresh') + ) +} + +/** True when HTML is Imwald's SPA index (served when OG proxy is missing or misrouted). */ +export function htmlLooksLikeImwaldAppShell(html: string): boolean { + if (htmlLooksLikeLocalDevAppShell(html)) return true + const head = html.slice(0, 16_000) + if (head.includes('imwald-boot-splash') && head.includes('Imwald')) return true + if (head.includes('jumble.imwald.eu/og-image') && /property="og:title"[^>]*content="Imwald"/i.test(head)) { + return true + } + return false +} + +export function isImwaldDefaultOpenGraphTitle(title: string | null | undefined): boolean { + if (!title) return false + const t = title.trim() + return ( + /^imwald$/i.test(t) || + t.includes('Imwald ') || + /jumble\s*-\s*imwald edition/i.test(t) || + /jumble imwald edition/i.test(t) + ) +} + +export function isImwaldDefaultOpenGraphDescription(description: string | null | undefined): boolean { + if (!description) return false + return /user-friendly nostr client focused on relay feed browsing/i.test(description) +} + +function metaContent(doc: Document, selectors: string[]): string | undefined { + for (const sel of selectors) { + const el = doc.querySelector(sel) + const v = el?.getAttribute('content') ?? (el as HTMLMetaElement | null)?.content + if (v?.trim()) return v.trim() + } + return undefined +} + +function resolveMaybeRelativeUrl(value: string, pageUrl: string): string { + try { + const urlObj = new URL(pageUrl) + if (value.startsWith('/')) { + return `${urlObj.protocol}//${urlObj.host}${value}` + } + if (!value.match(/^https?:\/\//)) { + const basePath = urlObj.pathname.substring(0, urlObj.pathname.lastIndexOf('/') + 1) + return `${urlObj.protocol}//${urlObj.host}${basePath}${value}` + } + return value + } catch { + return value + } +} + +function isFaviconOgImage(image: string): boolean { + const imageLower = image.toLowerCase() + return ( + imageLower.includes('/favicon') || + imageLower.endsWith('/favicon.ico') || + imageLower.endsWith('/favicon.svg') + ) +} + +/** Parse Open Graph / Twitter / description meta tags from fetched HTML. */ +export function parseOpenGraphFromHtml(html: string, pageUrl: string): TWebMetadata { + if (htmlLooksLikeImwaldAppShell(html)) { + logger.debug('[OpenGraph] Ignoring Imwald app shell HTML', { pageUrl }) + return {} + } + + const parser = new DOMParser() + const doc = parser.parseFromString(html, 'text/html') + + let title = metaContent(doc, [ + 'meta[property="og:title"]', + 'meta[name="og:title"]', + 'meta[name="twitter:title"]', + 'meta[property="twitter:title"]' + ]) + if (!title) { + const titleTag = doc.querySelector('title')?.textContent?.trim() + if (titleTag) title = titleTag + } + if (title) { + if ( + /^(Redirecting|Loading|Please wait|Redirect)(\.\.\.|…)?$/i.test(title) || + title === '...' || + title === '…' + ) { + title = undefined + } + } + + let description = metaContent(doc, [ + 'meta[property="og:description"]', + 'meta[name="og:description"]', + 'meta[name="twitter:description"]', + 'meta[property="twitter:description"]', + 'meta[name="description"]' + ]) + + let image = metaContent(doc, [ + 'meta[property="og:image"]', + 'meta[name="og:image"]', + 'meta[property="og:image:url"]', + 'meta[property="og:image:secure_url"]', + 'meta[name="twitter:image"]', + 'meta[property="twitter:image"]' + ]) + + let audio = metaContent(doc, [ + 'meta[property="og:audio"]', + 'meta[property="og:audio:url"]', + 'meta[property="og:audio:secure_url"]', + 'meta[name="og:audio"]' + ]) + + if (image) { + try { + image = resolveMaybeRelativeUrl(image, pageUrl) + if (isFaviconOgImage(image)) { + logger.warn('[OpenGraph] Filtered favicon from OG image', { pageUrl, image }) + image = undefined + } + } catch (error) { + logger.warn('[OpenGraph] Failed to resolve image URL', { image, pageUrl, error }) + image = undefined + } + } + + if (audio && !audio.match(/^https?:\/\//)) { + try { + audio = resolveMaybeRelativeUrl(audio, pageUrl) + if (!audio.match(/^https?:\/\//)) audio = undefined + } catch { + audio = undefined + } + } + + try { + const urlObj = new URL(pageUrl) + const isAppCanonicalHost = urlObj.hostname === 'jumble.imwald.eu' + if (!isAppCanonicalHost) { + if (isImwaldDefaultOpenGraphTitle(title)) title = undefined + if (isImwaldDefaultOpenGraphDescription(description)) description = undefined + if (image?.includes('jumble.imwald.eu/og-image')) image = undefined + if (!title && !description && !image && !audio) { + logger.debug('[OpenGraph] Stripped Imwald default tags for external URL', { + url: pageUrl, + hostname: urlObj.hostname + }) + } + } + } catch { + /* ignore */ + } + + return { title, description, image, audio } +} diff --git a/src/services/web.service.ts b/src/services/web.service.ts index 41e42d59..b03902d8 100644 --- a/src/services/web.service.ts +++ b/src/services/web.service.ts @@ -4,6 +4,7 @@ import { isSitesProxyUnavailableThisSession, markSitesProxyUnavailableFromHttpStatus } from '@/lib/optional-proxy-session' +import { htmlLooksLikeImwaldAppShell, parseOpenGraphFromHtml } from '@/lib/open-graph' import { buildDevLocalSitesFetchUrl, buildViteProxySitesFetchUrl, @@ -13,17 +14,6 @@ import { TWebMetadata } from '@/types' import DataLoader from 'dataloader' import logger from '@/lib/logger' -/** True when HTML is the Vite/React dev shell or another SPA stub, not the target page. */ -function htmlLooksLikeLocalDevAppShell(html: string): boolean { - const head = html.slice(0, 8000) - return ( - head.includes('injectIntoGlobalHook') || - head.includes('/@vite/') || - head.includes('@vite/client') || - head.includes('@react-refresh') - ) -} - const HTML_FETCH_HEADERS = { Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (compatible; Imwald/1.0; +https://jumble.imwald.eu)' @@ -49,146 +39,89 @@ async function tryFetchHtml( if (!res.ok) return { html: null, status: res.status } const html = await res.text() if (html.length < 50) return { html: null, status: res.status } - if (htmlLooksLikeLocalDevAppShell(html)) return { html: null, status: res.status } + if (htmlLooksLikeImwaldAppShell(html)) { + logger.debug('[WebService] Ignoring app-shell HTML from fetch', { fetchUrl }) + return { html: null, status: res.status } + } return { html } } catch { return { html: null } } } -/** - * OG HTML: always use `VITE_PROXY_SERVER` first when set; if that fails or is unset, fetch the page directly. - */ -async function fetchHtmlForOpenGraph(originalUrl: string): Promise<{ html: string; via: string } | null> { - const isAlreadyProxyRequest = urlLooksLikeViteProxyRequest(originalUrl) - - if (isAlreadyProxyRequest) { - const { html } = await tryFetchHtml(originalUrl, 35_000) - return html ? { html, via: originalUrl } : null - } +type OgFetchAttempt = { label: string; url: string; timeoutMs: number; direct?: boolean } +function buildOgFetchAttempts(originalUrl: string): OgFetchAttempt[] { + const attempts: OgFetchAttempt[] = [] const proxyServer = import.meta.env.VITE_PROXY_SERVER?.trim() + const proxyDown = isSitesProxyUnavailableThisSession() - if (proxyServer && !isSitesProxyUnavailableThisSession()) { - const proxyFetchUrl = buildViteProxySitesFetchUrl(originalUrl, proxyServer) - logger.debug('[WebService] OG fetch via VITE_PROXY_SERVER', { originalUrl, proxyFetchUrl }) - const proxyTry = await tryFetchHtml(proxyFetchUrl, 35_000) - if (proxyTry.html) { - clearSitesProxyUnavailableThisSession() - return { html: proxyTry.html, via: proxyFetchUrl } - } - if (typeof proxyTry.status === 'number') { - markSitesProxyUnavailableFromHttpStatus(proxyTry.status) - } - logger.debug('[WebService] OG proxy unavailable or bad response', { originalUrl, status: proxyTry.status }) + if (proxyServer && !proxyDown && !urlLooksLikeViteProxyRequest(originalUrl)) { + attempts.push({ + label: 'vite-proxy', + url: buildViteProxySitesFetchUrl(originalUrl, proxyServer), + timeoutMs: 35_000 + }) } if (import.meta.env.DEV) { const devSitesUrl = buildDevLocalSitesFetchUrl(originalUrl) - if (devSitesUrl && !isSitesProxyUnavailableThisSession()) { - const devTry = await tryFetchHtml(devSitesUrl, 35_000) - if (devTry.html) { - clearSitesProxyUnavailableThisSession() - return { html: devTry.html, via: devSitesUrl } - } - if (typeof devTry.status === 'number') { - markSitesProxyUnavailableFromHttpStatus(devTry.status) - } + if (devSitesUrl && !proxyDown) { + attempts.push({ label: 'dev-sites', url: devSitesUrl, timeoutMs: 35_000 }) } - const direct = await tryFetchHtml(originalUrl, 15_000, { direct: true }) - return direct.html ? { html: direct.html, via: 'direct' } : null + attempts.push({ label: 'direct', url: originalUrl, timeoutMs: 15_000, direct: true }) + } else if (!proxyServer || proxyDown) { + attempts.push({ label: 'direct', url: originalUrl, timeoutMs: 15_000, direct: true }) } - // In production with a configured proxy, skip direct fetch: random sites rarely allow browser CORS, - // and the attempt spams DevTools with cross-origin errors without improving OG success. - if (proxyServer) { - return null - } - - const directOnly = await tryFetchHtml(originalUrl, 15_000, { direct: true }) - return directOnly.html ? { html: directOnly.html, via: 'direct' } : null -} - -function parseOpenGraphFromHtml(html: string, pageUrl: string): TWebMetadata { - const parser = new DOMParser() - const doc = parser.parseFromString(html, 'text/html') - - const ogTitleMeta = doc.querySelector('meta[property="og:title"]') - const titleTag = doc.querySelector('title') - - let title = ogTitleMeta?.getAttribute('content') || titleTag?.textContent - if (title) { - const trimmedTitle = title.trim() - if ( - /^(Redirecting|Loading|Please wait|Redirect)(\.\.\.|…)?$/i.test(trimmedTitle) || - trimmedTitle === '...' || - trimmedTitle === '…' - ) { - title = undefined + attempts.push( + { + label: 'allorigins', + url: `https://api.allorigins.win/raw?url=${encodeURIComponent(originalUrl)}`, + timeoutMs: 25_000 + }, + { + label: 'corsproxy', + url: `https://corsproxy.io/?${encodeURIComponent(originalUrl)}`, + timeoutMs: 25_000 } - } - - const description = - doc.querySelector('meta[property="og:description"]')?.getAttribute('content') || - (doc.querySelector('meta[name="description"]') as HTMLMetaElement | null)?.content + ) - let image = (doc.querySelector('meta[property="og:image"]') as HTMLMetaElement | null)?.content + return attempts +} - let audio = - doc.querySelector('meta[property="og:audio"]')?.getAttribute('content') || - doc.querySelector('meta[property="og:audio:url"]')?.getAttribute('content') || - doc.querySelector('meta[property="og:audio:secure_url"]')?.getAttribute('content') || - null - if (audio && !audio.match(/^https?:\/\//)) { - audio = null +/** + * OG HTML: configured `/sites/?url=…` proxy first; then direct (dev or when proxy is down); + * then public CORS proxies as last resort. + */ +async function fetchHtmlForOpenGraph(originalUrl: string): Promise<{ html: string; via: string } | null> { + if (urlLooksLikeViteProxyRequest(originalUrl)) { + const { html } = await tryFetchHtml(originalUrl, 35_000) + return html ? { html, via: originalUrl } : null } - if (image) { - try { - const urlObj = new URL(pageUrl) - if (image.startsWith('/')) { - image = `${urlObj.protocol}//${urlObj.host}${image}` - } else if (!image.match(/^https?:\/\//)) { - const basePath = urlObj.pathname.substring(0, urlObj.pathname.lastIndexOf('/') + 1) - image = `${urlObj.protocol}//${urlObj.host}${basePath}${image}` - } - - const imageLower = image.toLowerCase() - if ( - imageLower.includes('/favicon') || - imageLower.endsWith('/favicon.ico') || - imageLower.endsWith('/favicon.svg') - ) { - logger.warn('[WebService] Filtered out favicon URL from OG image', { url: pageUrl, image }) - image = undefined + for (const attempt of buildOgFetchAttempts(originalUrl)) { + logger.debug('[WebService] OG fetch attempt', { + originalUrl, + label: attempt.label, + fetchUrl: attempt.url + }) + const result = await tryFetchHtml(attempt.url, attempt.timeoutMs, { direct: attempt.direct }) + if (result.html) { + if (attempt.label === 'vite-proxy' || attempt.label === 'dev-sites') { + clearSitesProxyUnavailableThisSession() } - } catch (error) { - logger.warn('[WebService] Failed to convert relative image URL', { image, url: pageUrl, error }) + return { html: result.html, via: attempt.label } } - } - - try { - const urlObj = new URL(pageUrl) - const isAppCanonicalHost = urlObj.hostname === 'jumble.imwald.eu' - const isAppDefaultTitle = - title?.includes('Imwald ') || - title?.includes('Jumble - Imwald Edition') || - title?.includes('Jumble Imwald Edition') - const isAppDefaultDesc = description?.includes( - 'A user-friendly Nostr client focused on relay feed browsing' - ) - if (!isAppCanonicalHost && (isAppDefaultTitle || isAppDefaultDesc)) { - logger.debug('[WebService] Filtered out Imwald default OG tags for external domain', { - url: pageUrl, - hostname: urlObj.hostname - }) - return {} + if ( + (attempt.label === 'vite-proxy' || attempt.label === 'dev-sites') && + typeof result.status === 'number' + ) { + markSitesProxyUnavailableFromHttpStatus(result.status) } - } catch { - /* ignore */ } - return { title, description, image, audio } + return null } class WebService {