translation bug-fixes

2 months ago · fd6eea6f12
4 changed files with 120 additions and 0 deletions
--- a/src/lib/advanced-lab-markup-protect.test.ts
+++ b/src/lib/advanced-lab-markup-protect.test.ts
@ -137,6 +137,20 @@ describe('getMarkupProtectRanges', () => {
				@@ -137,6 +137,20 @@ describe('getMarkupProtectRanges', () => {
    expect(rangeIntersectsMerged(t.lastIndexOf(npub), npub.length, merged)).toBe(true)
  })

+  it('markdown: freezes a full raw https URL (blossom npub-shaped host; .gif not translatable)', () => { // the whole URL must land inside a single protected range
+    const url =
+      'https://npub1uq6dv4yq94704gk5r22jsqg9gy2wpxkk5dft9q5gugc8tj53nq2qg5q22d.blossom.band/efc560395efdc7327db278ea4a7677905f69fecf1e6db754f41309d62f8ddb23.gif'
+    const t = `See this\n${url}\nnext`
+    const merged = getMarkupProtectRanges(t, 'markdown')
+    expect( // at least one range, sliced as [a, b) out of t, runs from the scheme through the file extension
+      merged.some(([a, b]) => {
+        const s = t.slice(a, b)
+        return s.startsWith('https://') && s.endsWith('.gif') && s.includes('blossom.band')
+      })
+    ).toBe(true)
+    expect(rangeIntersectsMerged(t.indexOf('.gif'), 4, merged)).toBe(true) // 4 === '.gif'.length: the extension itself is protected
+  })
+
  it('freezes BOOKSTR_MARKER passthrough and WIKILINK marker', () => {
    const book = 'BOOKSTR_MARKER:foo:BOOKSTR_END'
    const wiki = 'WIKILINK:my-page[My Page]'
@ -224,6 +238,27 @@ describe('translateAdvancedLabMarkup', () => {
				@@ -224,6 +238,27 @@ describe('translateAdvancedLabMarkup', () => {
    expect(out).toBe('[<Hi>](https://x.com)')
  })

+  it('markdown: does not translate raw https URL (npub-like blossom host / .gif path)', async () => { // the URL sits on its own line between two prose lines
+    const url =
+      'https://npub1uq6dv4yq94704gk5r22jsqg9gy2wpxkk5dft9q5gugc8tj53nq2qg5q22d.blossom.band/x.gif'
+    const out = await translateAdvancedLabMarkup(`Before\n${url}\nAfter`, 'de', 'en', 'markdown')
+    expect(out).toContain(url) // weaker check; already implied by the exact match below
+    expect(out).toBe('<Before>\n' + url + '\n<After>') // prose comes back wrapped in <…> (presumably by a translate stub set up elsewhere in this test file - confirm); URL and newlines are untouched
+  })
+
+  it('markdown: freezes #hashtag tokens when mixed with prose', () => { // both hashtags must intersect a protected range
+    const t = 'Cool #meme and #memestr stuff'
+    const merged = getMarkupProtectRanges(t, 'markdown')
+    expect(rangeIntersectsMerged(t.indexOf('#meme'), 5, merged)).toBe(true) // 5 === '#meme'.length; indexOf finds the first, standalone '#meme'
+    expect(rangeIntersectsMerged(t.indexOf('#memestr'), 8, merged)).toBe(true) // 8 === '#memestr'.length
+  })
+
+  it('markdown: leaves hashtags unchanged inside translated prose', async () => { // end-to-end: the hashtag must survive translation verbatim
+    const out = await translateAdvancedLabMarkup('Enjoy #meme today', 'de', 'en', 'markdown')
+    expect(out).toContain('#meme') // substring check
+    expect(out).toMatch(/#meme/) // same condition as the line above, expressed as a regex
+  })
+
  it('markdown: translates optional link title in quotes', async () => {
    const out = await translateAdvancedLabMarkup(
      '[Hi](https://x.com "Link title")',
--- a/src/lib/advanced-lab-markup-protect.ts
+++ b/src/lib/advanced-lab-markup-protect.ts
@ -5,7 +5,9 @@
				@@ -5,7 +5,9 @@
 * emphasis/strike delimiters, and AsciiDoc blocks, macros, stem, passthrough, xref.
 * Also: wiki `[[…]]` (incl. `book::`, `citation::`), `wikilink:`, `BOOKSTR_MARKER:…:BOOKSTR_END`,
 * `nostr:…` / bare NIP-19 bech32 (`npub1`…, `nprofile1`…, etc.), and `link:url[text]` macros.
+ * Raw `http://` / `https://` URLs (so blossom-style hosts `https://npub1….band/…/file.gif` are not split for translation).
 * NIP-style custom/native emoji shortcodes `:shortcode:` (see {@link EMOJI_SHORT_CODE_REGEX}).
+ * Markdown `#hashtag` tokens (Unicode letters/numbers/marks, `_`, `-`).
 */

 import { EMOJI_SHORT_CODE_REGEX } from '@/lib/content-patterns'
@ -762,6 +764,42 @@ function collectWikilinkMarkerRanges(text: string, merged: [number, number][]):
				@@ -762,6 +764,42 @@ function collectWikilinkMarkerRanges(text: string, merged: [number, number][]):
  return ranges
 }

+/**
+ * Standalone `http://` / `https://` URLs (not only inside Markdown `[]()`).
+ * Without this, a host like `npub1….blossom.band/…/x.gif` is split: bare-npub protection covers only the
+ * first 63 chars after `npub1`, leaving `.gif` and the path in the translatable stream.
+ *
+ * Scans `text` left to right for a case-insensitive `http://` or `https://` prefix and extends the span
+ * until whitespace, one of `<`, `>`, `"`, `'`, a backtick, `)`, `]`, `}`, or the end of the text.
+ * Any other punctuation (for example a sentence-final `.` or `,` directly after the URL) stays inside the span.
+ *
+ * @param text Full source text to scan.
+ * @param merged Ranges collected so far; an index for which `posInMerged` is truthy is never used as a
+ *   URL start. Only the start index is checked, so a span may run on into such a range.
+ * @returns Half-open `[start, end)` index pairs into `text`, in ascending order.
+ */
+function collectRawHttpUrlRanges(text: string, merged: [number, number][]): [number, number][] {
+  const ranges: [number, number][] = []
+  let i = 0
+  while (i < text.length - 7) { // at least 8 characters must remain at i before a scheme match is attempted
+    if (posInMerged(i, merged)) { // presumably: index i already lies inside a protected range (helper not visible here)
+      i++
+      continue
+    }
+    const head = text.slice(i, i + 8).toLowerCase() // lowercased so the scheme comparison is case-insensitive
+    const isHttps = head.startsWith('https://')
+    const isHttp = !isHttps && head.startsWith('http://')
+    if (!isHttps && !isHttp) {
+      i++
+      continue
+    }
+    const start = i
+    i += isHttps ? 8 : 7 // step over the scheme
+    let end = i
+    while (end < text.length) {
+      const c = text[end]!
+      if (/\s/.test(c)) break // whitespace ends the URL
+      if (c === '<' || c === '>' || c === '"' || c === "'" || c === '`') break
+      if (c === ')' || c === ']' || c === '}') break
+      end++
+    }
+    if (end > start) ranges.push([start, end]) // always true here: end begins at least 7 past start; end is exclusive
+    i = end // resume scanning at the terminating character
+  }
+  return ranges
+}
+
 /** AsciiDoc / Markdown `link:url[text]` and `menu:…[…]` (toolbar + jumble). */
 function collectLinkMenuColonMacros(text: string, merged: [number, number][]): [number, number][] {
  const ranges: [number, number][] = []
@ -868,6 +906,19 @@ function collectAsciiDocXrefRanges(text: string, merged: [number, number][]): [n
				@@ -868,6 +906,19 @@ function collectAsciiDocXrefRanges(text: string, merged: [number, number][]): [n
  return ranges
 }

+/**
+ * `#tag` / `#plebchain` spans; skipped when already inside a frozen range (e.g. URL).
+ *
+ * A hashtag is `#` followed by one or more Unicode letters, numbers, marks, `_` or `-`.
+ * The pattern does not look at the character before `#`, so a `#` in the middle of a word matches too.
+ *
+ * @param text Full source text to scan.
+ * @param merged Ranges collected so far; a match is dropped when `posInMerged` is truthy for its
+ *   start index (only the start is checked).
+ * @returns Half-open `[start, end)` index pairs into `text`, in ascending order.
+ */
+function collectMarkdownHashtagRanges(text: string, merged: [number, number][]): [number, number][] {
+  const ranges: [number, number][] = []
+  const re = /#[\p{L}\p{N}\p{M}_-]+/gu
+  let m: RegExpExecArray | null
+  while ((m = re.exec(text)) !== null) { // the g flag advances lastIndex, so each exec returns the next match
+    const start = m.index
+    if (posInMerged(start, merged)) continue // presumably: start already lies inside a protected range (helper not visible here)
+    ranges.push([start, start + m[0].length])
+  }
+  return ranges
+}
+
 /** `:shortcode:` spans (custom / native emoji); skipped when already inside code, links, etc. */
 function collectEmojiShortcodeRanges(text: string, merged: [number, number][]): [number, number][] {
  const ranges: [number, number][] = []
@ -896,6 +947,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode
				@@ -896,6 +947,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode
  merged = mergeSortedRanges([...merged, ...bookstrPass, ...wikilinkM])
  const linkMenu = collectLinkMenuColonMacros(text, merged)
  merged = mergeSortedRanges([...merged, ...linkMenu])
+  const rawHttpUrls = collectRawHttpUrlRanges(text, merged)
+  merged = mergeSortedRanges([...merged, ...rawHttpUrls])
  const nostrBech = collectNostrAndBech32Ranges(text, merged)
  merged = mergeSortedRanges([...merged, ...nostrBech])
  const triplePlus = collectAsciiDocTriplePlusPassthrough(text, merged)
@ -912,6 +965,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode
				@@ -912,6 +965,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode
    const bold = collectMarkdownDelimiterPairEdges(text, mergeSortedRanges([...merged, ...strike]), '**')
    const boldU = collectMarkdownDelimiterPairEdges(text, mergeSortedRanges([...merged, ...bold]), '__')
    merged = mergeSortedRanges([...merged, ...strike, ...bold, ...boldU])
+    const hashtags = collectMarkdownHashtagRanges(text, merged)
+    merged = mergeSortedRanges([...merged, ...hashtags])
  } else {
    const adocBlocks = collectAsciiDocStructuredBlocks(text, merged)
    merged = mergeSortedRanges([...merged, ...adocBlocks])
--- a/src/lib/translate-client.test.ts
+++ b/src/lib/translate-client.test.ts
@ -0,0 +1,18 @@
				@@ -0,0 +1,18 @@
+import { describe, expect, it } from 'vitest'
+import { shouldSkipMachineTranslatePlainCore } from '@/lib/translate-client'
+
+describe('shouldSkipMachineTranslatePlainCore', () => { // pins which inputs count as hashtag-only
+  it('returns true for one or more ASCII hashtags with spaces', () => {
+    expect(shouldSkipMachineTranslatePlainCore('#meme #memes #memestr #plebchain')).toBe(true)
+    expect(shouldSkipMachineTranslatePlainCore('  #a #b  ')).toBe(true) // leading and trailing whitespace is ignored
+  })
+
+  it('returns false when there is non-hashtag prose', () => {
+    expect(shouldSkipMachineTranslatePlainCore('#meme is cool')).toBe(false) // prose after a hashtag
+    expect(shouldSkipMachineTranslatePlainCore('see #meme')).toBe(false) // prose before a hashtag
+  })
+
+  it('returns true for unicode hashtag letters', () => {
+    expect(shouldSkipMachineTranslatePlainCore('#café #naïve')).toBe(true) // accented letters count as hashtag characters
+  })
+})
--- a/src/lib/translate-client.ts
+++ b/src/lib/translate-client.ts
@ -144,6 +144,14 @@ export function clearTranslateLanguagesCache(): void {
				@@ -144,6 +144,14 @@ export function clearTranslateLanguagesCache(): void {
  advertisedTranslateApiCodes = null
 }

+/**
+ * LibreTranslate / Argos often corrupts hashtag-only lines (random glyphs, subtitle-like junk,
+ * dropped letters). Nostr-style hashtags must stay verbatim.
+ *
+ * @param core Text to inspect; it is trimmed before matching, so surrounding whitespace is ignored.
+ * @returns `true` when the trimmed text consists solely of one or more `#tag` tokens separated by
+ *   whitespace, where a tag is `#` plus one or more Unicode letters, numbers, marks, `_` or `-`.
+ *   `false` otherwise, including for an empty or whitespace-only string.
+ */
+export function shouldSkipMachineTranslatePlainCore(core: string): boolean {
+  return /^(?:#[\p{L}\p{N}\p{M}_-]+(?:\s+#[\p{L}\p{N}\p{M}_-]+)*)\s*$/u.test(core.trim())
+}
+
 export async function translatePlainText(
  text: string,
  targetLang: string,
@ -162,6 +170,10 @@ export async function translatePlainText(
				@@ -162,6 +170,10 @@ export async function translatePlainText(
    return text
  }

+  if (shouldSkipMachineTranslatePlainCore(core)) {
+    return text
+  }
+
  const resolvedTarget = translateApiLanguageCode(targetLang)
  const resolvedSource =
    sourceLang === 'auto' ? 'auto' : translateApiLanguageCode(sourceLang)