Browse Source

adjust og proxy

imwald
Silberengel 2 weeks ago
parent
commit
d2a9af19d2
  1. 70
      src/lib/open-graph.test.ts
  2. 171
      src/lib/open-graph.ts
  3. 183
      src/services/web.service.ts

70
src/lib/open-graph.test.ts

@ -0,0 +1,70 @@
import { describe, expect, it } from 'vitest'
import {
htmlLooksLikeImwaldAppShell,
isImwaldDefaultOpenGraphDescription,
isImwaldDefaultOpenGraphTitle,
parseOpenGraphFromHtml
} from './open-graph'
/**
 * Fixture: HTML resembling Imwald's own SPA index — default `og:*` tags, the
 * `<title>Imwald</title>` tag and the `imwald-boot-splash` node that the app-shell
 * detection looks for.
 */
const IMWALD_INDEX_SNIPPET = `<!doctype html>
<html><head>
<title>Imwald</title>
<meta property="og:title" content="Imwald" />
<meta property="og:description" content="Imwald — a user-friendly Nostr client focused on relay feed browsing, publications, and relay discovery." />
<meta property="og:image" content="https://jumble.imwald.eu/og-image.png" />
</head><body><div id="root"><div id="imwald-boot-splash"></div></div></body></html>`
/**
 * Fixture: an ordinary third-party page carrying `og:title`, `og:description`,
 * `og:image` and `og:audio`, with none of the Imwald shell markers.
 */
const FOUNTAIN_SNIPPET = `<!doctype html>
<html><head>
<meta property="og:title" content="Episode Title | Fountain" />
<meta property="og:description" content="A podcast episode" />
<meta property="og:image" content="https://fountain.fm/cover.jpg" />
<meta property="og:audio" content="https://fountain.fm/audio.mp3" />
</head><body></body></html>`
// Unit tests for the Open Graph helpers exported from './open-graph'.
describe('open-graph', () => {
// Shell detection: the Imwald index fixture is recognised, a regular page is not.
it('detects Imwald app shell HTML', () => {
expect(htmlLooksLikeImwaldAppShell(IMWALD_INDEX_SNIPPET)).toBe(true)
expect(htmlLooksLikeImwaldAppShell(FOUNTAIN_SNIPPET)).toBe(false)
})
// When the fetched body is the app shell, the parser yields an empty object
// instead of attributing Imwald's metadata to the requested URL.
it('returns empty metadata for app shell on external URLs', () => {
expect(parseOpenGraphFromHtml(IMWALD_INDEX_SNIPPET, 'https://fountain.fm/episode/x')).toEqual({})
})
// Happy path: all four og:* fields come back verbatim.
// NOTE(review): the fixture contains no twitter:* tags, so the twitter fallback
// named in the test title is not actually exercised here.
it('parses og and twitter tags from a normal page', () => {
expect(parseOpenGraphFromHtml(FOUNTAIN_SNIPPET, 'https://fountain.fm/episode/x')).toEqual({
title: 'Episode Title | Fountain',
description: 'A podcast episode',
image: 'https://fountain.fm/cover.jpg',
audio: 'https://fountain.fm/audio.mp3'
})
})
// The bare word "Imwald" counts as the default title; an unrelated title does not.
it('strips Imwald default title even without trailing space', () => {
expect(isImwaldDefaultOpenGraphTitle('Imwald')).toBe(true)
expect(isImwaldDefaultOpenGraphTitle('Episode Title')).toBe(false)
})
// The input differs in letter case ("Nostr") from the lowercase pattern and is
// shorter than the full default description, so this pins a case-insensitive
// substring match.
it('strips Imwald default description case-insensitively', () => {
expect(
isImwaldDefaultOpenGraphDescription(
'Imwald — a user-friendly Nostr client focused on relay feed browsing.'
)
).toBe(true)
})
// Only the Imwald og-image is removed on a foreign host; title and description
// survive. `image`/`audio` are listed as undefined to document the returned shape.
it('filters jumble og-image on external hosts while keeping other fields', () => {
const html = `<html><head>
<meta property="og:title" content="Real Site" />
<meta property="og:description" content="About the site" />
<meta property="og:image" content="https://jumble.imwald.eu/og-image.png" />
</head></html>`
expect(parseOpenGraphFromHtml(html, 'https://example.com/page')).toEqual({
title: 'Real Site',
description: 'About the site',
image: undefined,
audio: undefined
})
})
})

171
src/lib/open-graph.ts

@ -0,0 +1,171 @@
import { TWebMetadata } from '@/types'
import logger from '@/lib/logger'
/**
 * True when HTML is the Vite/React dev shell or another SPA stub, not the target page.
 *
 * Only the first 8000 characters of `html` are inspected; the check is a plain
 * case-sensitive substring search for a handful of dev-tooling markers.
 *
 * @param html - Raw HTML text to inspect.
 * @returns `true` if any marker occurs within the inspected prefix.
 */
export function htmlLooksLikeLocalDevAppShell(html: string): boolean {
  const devShellMarkers = ['injectIntoGlobalHook', '/@vite/', '@vite/client', '@react-refresh']
  const prefix = html.slice(0, 8000)
  return devShellMarkers.some((marker) => prefix.includes(marker))
}
/**
 * True when HTML is Imwald's SPA index (served when OG proxy is missing or misrouted).
 *
 * A local dev shell is treated as an app shell as well. Otherwise the first
 * 16,000 characters must show one of two signatures:
 * - the `imwald-boot-splash` marker together with `<title>Imwald</title>`, or
 * - the Imwald og-image URL together with an `og:title` meta tag whose content is "Imwald".
 *
 * @param html - Raw HTML text to inspect.
 * @returns `true` when the HTML looks like the app itself rather than a fetched page.
 */
export function htmlLooksLikeImwaldAppShell(html: string): boolean {
  if (htmlLooksLikeLocalDevAppShell(html)) {
    return true
  }

  const prefix = html.slice(0, 16_000)

  const hasBootSplashSignature =
    prefix.includes('imwald-boot-splash') && prefix.includes('<title>Imwald</title>')
  const hasDefaultOgSignature =
    prefix.includes('jumble.imwald.eu/og-image') &&
    /property="og:title"[^>]*content="Imwald"/i.test(prefix)

  return hasBootSplashSignature || hasDefaultOgSignature
}
/**
 * Tells whether a page title is one of Imwald's own default titles.
 *
 * After trimming, a title counts as default when it is exactly "imwald"
 * (any letter case), contains the case-sensitive text "Imwald " (with the
 * trailing space), or contains "jumble - imwald edition" / "jumble imwald edition"
 * (any letter case).
 *
 * @param title - Candidate title; `null`, `undefined` and `''` are never defaults.
 * @returns `true` when the title matches one of the default forms.
 */
export function isImwaldDefaultOpenGraphTitle(title: string | null | undefined): boolean {
  if (title === null || title === undefined || title === '') {
    return false
  }

  const candidate = title.trim()
  if (candidate.includes('Imwald ')) {
    return true
  }

  const defaultTitlePatterns = [
    /^imwald$/i,
    /jumble\s*-\s*imwald edition/i,
    /jumble imwald edition/i
  ]
  return defaultTitlePatterns.some((pattern) => pattern.test(candidate))
}
/**
 * Tells whether a description contains Imwald's default blurb.
 *
 * The match is a case-insensitive substring search for
 * "user-friendly nostr client focused on relay feed browsing".
 *
 * @param description - Candidate description; empty or missing values are never defaults.
 * @returns `true` when the blurb occurs anywhere in the description.
 */
export function isImwaldDefaultOpenGraphDescription(description: string | null | undefined): boolean {
  const defaultBlurb = /user-friendly nostr client focused on relay feed browsing/i
  return description ? defaultBlurb.test(description) : false
}
/**
 * Returns the trimmed `content` of the first selector that yields a non-blank value.
 *
 * Selectors are tried in order and evaluation stops at the first hit. For each
 * selector only the first matching element is consulted; its `content` attribute
 * is read, falling back to the `content` property when the attribute is absent.
 *
 * @param doc - Parsed document to query.
 * @param selectors - CSS selectors in priority order.
 * @returns The trimmed content, or `undefined` when no selector produces text.
 */
function metaContent(doc: Document, selectors: string[]): string | undefined {
  let found: string | undefined

  selectors.some((selector) => {
    const node = doc.querySelector(selector)
    const raw = node?.getAttribute('content') ?? (node as HTMLMetaElement | null)?.content
    const cleaned = raw?.trim()
    if (!cleaned) {
      return false
    }
    found = cleaned
    return true
  })

  return found
}
/**
 * Resolve a URL found in a meta tag against the page it came from.
 *
 * - Values already starting with `http://` or `https://` are returned untouched.
 * - Anything else is resolved with standard URL resolution relative to `pageUrl`,
 *   so root-relative (`/a.png`), path-relative (`a.png`, `../a.png`) and
 *   protocol-relative (`//cdn.example/a.png`) references all yield a proper
 *   absolute URL, and values carrying their own scheme are not mistaken for paths.
 * - When resolution fails (for example `pageUrl` is not a valid URL) the input
 *   is returned unchanged.
 *
 * @param value - URL or URL reference taken from the page.
 * @param pageUrl - Absolute URL of the page, used as the resolution base.
 * @returns The absolute URL, or `value` itself when it cannot be resolved.
 */
function resolveMaybeRelativeUrl(value: string, pageUrl: string): string {
  // Keep absolute http(s) URLs byte-for-byte; re-serialising them could alter
  // what the page author wrote (e.g. adding a trailing slash).
  if (/^https?:\/\//.test(value)) {
    return value
  }
  try {
    // The previous string concatenation treated every leading "/" as a path,
    // which turned "//cdn.host/x.png" into "https://page.host//cdn.host/x.png".
    return new URL(value, pageUrl).href
  } catch {
    return value
  }
}
/**
 * Tells whether an image URL points at a favicon.
 *
 * A URL is treated as a favicon when its lower-cased form contains `/favicon`.
 * This single substring test also covers URLs ending in `/favicon.ico` or
 * `/favicon.svg`, since both contain `/favicon`.
 *
 * @param image - Image URL to classify.
 * @returns `true` when the URL looks like a favicon.
 */
function isFaviconOgImage(image: string): boolean {
  return image.toLowerCase().includes('/favicon')
}
/**
 * Parse Open Graph / Twitter / description meta tags from fetched HTML.
 *
 * Returns `{}` when `html` looks like the Imwald app shell rather than the
 * requested page. Otherwise returns `{ title, description, image, audio }`,
 * where a field is `undefined` when it is missing or was filtered out:
 * - placeholder titles such as "Redirecting...", "Loading…" or "..." are dropped;
 * - images classified as favicons are dropped;
 * - audio that is not an `http(s)` URL even after resolution is dropped;
 * - when `pageUrl` is not on `jumble.imwald.eu`, Imwald's default title,
 *   default description and default og-image are dropped.
 *
 * @param html - HTML of the fetched page.
 * @param pageUrl - URL of the page; used as the base for relative image/audio
 *   URLs and for the hostname check above.
 * @returns The extracted metadata.
 */
export function parseOpenGraphFromHtml(html: string, pageUrl: string): TWebMetadata {
  if (htmlLooksLikeImwaldAppShell(html)) {
    logger.debug('[OpenGraph] Ignoring Imwald app shell HTML', { pageUrl })
    return {}
  }

  const doc = new DOMParser().parseFromString(html, 'text/html')

  let title = metaContent(doc, [
    'meta[property="og:title"]',
    'meta[name="og:title"]',
    'meta[name="twitter:title"]',
    'meta[property="twitter:title"]'
  ])
  if (!title) {
    // No OG/Twitter title: fall back to the document's <title> text.
    title = doc.querySelector('title')?.textContent?.trim() || undefined
  }
  if (
    title &&
    (/^(Redirecting|Loading|Please wait|Redirect)(\.\.\.|…)?$/i.test(title) ||
      title === '...' ||
      title === '…')
  ) {
    // Interstitial pages carry no useful title.
    title = undefined
  }

  let description = metaContent(doc, [
    'meta[property="og:description"]',
    'meta[name="og:description"]',
    'meta[name="twitter:description"]',
    'meta[property="twitter:description"]',
    'meta[name="description"]'
  ])
  let image = metaContent(doc, [
    'meta[property="og:image"]',
    'meta[name="og:image"]',
    'meta[property="og:image:url"]',
    'meta[property="og:image:secure_url"]',
    'meta[name="twitter:image"]',
    'meta[property="twitter:image"]'
  ])
  let audio = metaContent(doc, [
    'meta[property="og:audio"]',
    'meta[property="og:audio:url"]',
    'meta[property="og:audio:secure_url"]',
    'meta[name="og:audio"]'
  ])

  // resolveMaybeRelativeUrl handles its own failures by returning the input,
  // so no try/catch is needed around it.
  if (image) {
    image = resolveMaybeRelativeUrl(image, pageUrl)
    if (isFaviconOgImage(image)) {
      logger.warn('[OpenGraph] Filtered favicon from OG image', { pageUrl, image })
      image = undefined
    }
  }

  if (audio && !/^https?:\/\//.test(audio)) {
    const resolvedAudio = resolveMaybeRelativeUrl(audio, pageUrl)
    // Only http(s) audio is usable; anything else is discarded.
    audio = /^https?:\/\//.test(resolvedAudio) ? resolvedAudio : undefined
  }

  try {
    const { hostname } = new URL(pageUrl)
    if (hostname !== 'jumble.imwald.eu') {
      // Track whether a default was actually removed, so the debug line below is
      // not emitted for pages that simply had no metadata to begin with.
      let strippedDefault = false
      if (isImwaldDefaultOpenGraphTitle(title)) {
        title = undefined
        strippedDefault = true
      }
      if (isImwaldDefaultOpenGraphDescription(description)) {
        description = undefined
        strippedDefault = true
      }
      if (image?.includes('jumble.imwald.eu/og-image')) {
        image = undefined
        strippedDefault = true
      }
      if (strippedDefault && !title && !description && !image && !audio) {
        logger.debug('[OpenGraph] Stripped Imwald default tags for external URL', {
          url: pageUrl,
          hostname
        })
      }
    }
  } catch {
    /* pageUrl is not a parseable URL: skip the host-specific filtering */
  }

  return { title, description, image, audio }
}

183
src/services/web.service.ts

@ -4,6 +4,7 @@ import {
isSitesProxyUnavailableThisSession, isSitesProxyUnavailableThisSession,
markSitesProxyUnavailableFromHttpStatus markSitesProxyUnavailableFromHttpStatus
} from '@/lib/optional-proxy-session' } from '@/lib/optional-proxy-session'
import { htmlLooksLikeImwaldAppShell, parseOpenGraphFromHtml } from '@/lib/open-graph'
import { import {
buildDevLocalSitesFetchUrl, buildDevLocalSitesFetchUrl,
buildViteProxySitesFetchUrl, buildViteProxySitesFetchUrl,
@ -13,17 +14,6 @@ import { TWebMetadata } from '@/types'
import DataLoader from 'dataloader' import DataLoader from 'dataloader'
import logger from '@/lib/logger' import logger from '@/lib/logger'
/** True when HTML is the Vite/React dev shell or another SPA stub, not the target page. */
function htmlLooksLikeLocalDevAppShell(html: string): boolean {
const head = html.slice(0, 8000)
return (
head.includes('injectIntoGlobalHook') ||
head.includes('/@vite/') ||
head.includes('@vite/client') ||
head.includes('@react-refresh')
)
}
const HTML_FETCH_HEADERS = { const HTML_FETCH_HEADERS = {
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'User-Agent': 'Mozilla/5.0 (compatible; Imwald/1.0; +https://jumble.imwald.eu)' 'User-Agent': 'Mozilla/5.0 (compatible; Imwald/1.0; +https://jumble.imwald.eu)'
@ -49,146 +39,89 @@ async function tryFetchHtml(
if (!res.ok) return { html: null, status: res.status } if (!res.ok) return { html: null, status: res.status }
const html = await res.text() const html = await res.text()
if (html.length < 50) return { html: null, status: res.status } if (html.length < 50) return { html: null, status: res.status }
if (htmlLooksLikeLocalDevAppShell(html)) return { html: null, status: res.status } if (htmlLooksLikeImwaldAppShell(html)) {
logger.debug('[WebService] Ignoring app-shell HTML from fetch', { fetchUrl })
return { html: null, status: res.status }
}
return { html } return { html }
} catch { } catch {
return { html: null } return { html: null }
} }
} }
/** type OgFetchAttempt = { label: string; url: string; timeoutMs: number; direct?: boolean }
* OG HTML: always use `VITE_PROXY_SERVER` first when set; if that fails or is unset, fetch the page directly.
*/
async function fetchHtmlForOpenGraph(originalUrl: string): Promise<{ html: string; via: string } | null> {
const isAlreadyProxyRequest = urlLooksLikeViteProxyRequest(originalUrl)
if (isAlreadyProxyRequest) {
const { html } = await tryFetchHtml(originalUrl, 35_000)
return html ? { html, via: originalUrl } : null
}
function buildOgFetchAttempts(originalUrl: string): OgFetchAttempt[] {
const attempts: OgFetchAttempt[] = []
const proxyServer = import.meta.env.VITE_PROXY_SERVER?.trim() const proxyServer = import.meta.env.VITE_PROXY_SERVER?.trim()
const proxyDown = isSitesProxyUnavailableThisSession()
if (proxyServer && !isSitesProxyUnavailableThisSession()) { if (proxyServer && !proxyDown && !urlLooksLikeViteProxyRequest(originalUrl)) {
const proxyFetchUrl = buildViteProxySitesFetchUrl(originalUrl, proxyServer) attempts.push({
logger.debug('[WebService] OG fetch via VITE_PROXY_SERVER', { originalUrl, proxyFetchUrl }) label: 'vite-proxy',
const proxyTry = await tryFetchHtml(proxyFetchUrl, 35_000) url: buildViteProxySitesFetchUrl(originalUrl, proxyServer),
if (proxyTry.html) { timeoutMs: 35_000
clearSitesProxyUnavailableThisSession() })
return { html: proxyTry.html, via: proxyFetchUrl }
}
if (typeof proxyTry.status === 'number') {
markSitesProxyUnavailableFromHttpStatus(proxyTry.status)
}
logger.debug('[WebService] OG proxy unavailable or bad response', { originalUrl, status: proxyTry.status })
} }
if (import.meta.env.DEV) { if (import.meta.env.DEV) {
const devSitesUrl = buildDevLocalSitesFetchUrl(originalUrl) const devSitesUrl = buildDevLocalSitesFetchUrl(originalUrl)
if (devSitesUrl && !isSitesProxyUnavailableThisSession()) { if (devSitesUrl && !proxyDown) {
const devTry = await tryFetchHtml(devSitesUrl, 35_000) attempts.push({ label: 'dev-sites', url: devSitesUrl, timeoutMs: 35_000 })
if (devTry.html) {
clearSitesProxyUnavailableThisSession()
return { html: devTry.html, via: devSitesUrl }
}
if (typeof devTry.status === 'number') {
markSitesProxyUnavailableFromHttpStatus(devTry.status)
}
} }
const direct = await tryFetchHtml(originalUrl, 15_000, { direct: true }) attempts.push({ label: 'direct', url: originalUrl, timeoutMs: 15_000, direct: true })
return direct.html ? { html: direct.html, via: 'direct' } : null } else if (!proxyServer || proxyDown) {
attempts.push({ label: 'direct', url: originalUrl, timeoutMs: 15_000, direct: true })
} }
// In production with a configured proxy, skip direct fetch: random sites rarely allow browser CORS, attempts.push(
// and the attempt spams DevTools with cross-origin errors without improving OG success. {
if (proxyServer) { label: 'allorigins',
return null url: `https://api.allorigins.win/raw?url=${encodeURIComponent(originalUrl)}`,
} timeoutMs: 25_000
},
const directOnly = await tryFetchHtml(originalUrl, 15_000, { direct: true }) {
return directOnly.html ? { html: directOnly.html, via: 'direct' } : null label: 'corsproxy',
} url: `https://corsproxy.io/?${encodeURIComponent(originalUrl)}`,
timeoutMs: 25_000
function parseOpenGraphFromHtml(html: string, pageUrl: string): TWebMetadata {
const parser = new DOMParser()
const doc = parser.parseFromString(html, 'text/html')
const ogTitleMeta = doc.querySelector('meta[property="og:title"]')
const titleTag = doc.querySelector('title')
let title = ogTitleMeta?.getAttribute('content') || titleTag?.textContent
if (title) {
const trimmedTitle = title.trim()
if (
/^(Redirecting|Loading|Please wait|Redirect)(\.\.\.|…)?$/i.test(trimmedTitle) ||
trimmedTitle === '...' ||
trimmedTitle === '…'
) {
title = undefined
} }
} )
const description =
doc.querySelector('meta[property="og:description"]')?.getAttribute('content') ||
(doc.querySelector('meta[name="description"]') as HTMLMetaElement | null)?.content
let image = (doc.querySelector('meta[property="og:image"]') as HTMLMetaElement | null)?.content return attempts
}
let audio = /**
doc.querySelector('meta[property="og:audio"]')?.getAttribute('content') || * OG HTML: configured `/sites/?url=…` proxy first; then direct (dev or when proxy is down);
doc.querySelector('meta[property="og:audio:url"]')?.getAttribute('content') || * then public CORS proxies as last resort.
doc.querySelector('meta[property="og:audio:secure_url"]')?.getAttribute('content') || */
null async function fetchHtmlForOpenGraph(originalUrl: string): Promise<{ html: string; via: string } | null> {
if (audio && !audio.match(/^https?:\/\//)) { if (urlLooksLikeViteProxyRequest(originalUrl)) {
audio = null const { html } = await tryFetchHtml(originalUrl, 35_000)
return html ? { html, via: originalUrl } : null
} }
if (image) { for (const attempt of buildOgFetchAttempts(originalUrl)) {
try { logger.debug('[WebService] OG fetch attempt', {
const urlObj = new URL(pageUrl) originalUrl,
if (image.startsWith('/')) { label: attempt.label,
image = `${urlObj.protocol}//${urlObj.host}${image}` fetchUrl: attempt.url
} else if (!image.match(/^https?:\/\//)) { })
const basePath = urlObj.pathname.substring(0, urlObj.pathname.lastIndexOf('/') + 1) const result = await tryFetchHtml(attempt.url, attempt.timeoutMs, { direct: attempt.direct })
image = `${urlObj.protocol}//${urlObj.host}${basePath}${image}` if (result.html) {
} if (attempt.label === 'vite-proxy' || attempt.label === 'dev-sites') {
clearSitesProxyUnavailableThisSession()
const imageLower = image.toLowerCase()
if (
imageLower.includes('/favicon') ||
imageLower.endsWith('/favicon.ico') ||
imageLower.endsWith('/favicon.svg')
) {
logger.warn('[WebService] Filtered out favicon URL from OG image', { url: pageUrl, image })
image = undefined
} }
} catch (error) { return { html: result.html, via: attempt.label }
logger.warn('[WebService] Failed to convert relative image URL', { image, url: pageUrl, error })
} }
} if (
(attempt.label === 'vite-proxy' || attempt.label === 'dev-sites') &&
try { typeof result.status === 'number'
const urlObj = new URL(pageUrl) ) {
const isAppCanonicalHost = urlObj.hostname === 'jumble.imwald.eu' markSitesProxyUnavailableFromHttpStatus(result.status)
const isAppDefaultTitle =
title?.includes('Imwald ') ||
title?.includes('Jumble - Imwald Edition') ||
title?.includes('Jumble Imwald Edition')
const isAppDefaultDesc = description?.includes(
'A user-friendly Nostr client focused on relay feed browsing'
)
if (!isAppCanonicalHost && (isAppDefaultTitle || isAppDefaultDesc)) {
logger.debug('[WebService] Filtered out Imwald default OG tags for external domain', {
url: pageUrl,
hostname: urlObj.hostname
})
return {}
} }
} catch {
/* ignore */
} }
return { title, description, image, audio } return null
} }
class WebService { class WebService {

Loading…
Cancel
Save