Browse Source

translation bug-fixes

imwald
Silberengel 2 weeks ago
parent
commit
fd6eea6f12
  1. 35
      src/lib/advanced-lab-markup-protect.test.ts
  2. 55
      src/lib/advanced-lab-markup-protect.ts
  3. 18
      src/lib/translate-client.test.ts
  4. 12
      src/lib/translate-client.ts

35
src/lib/advanced-lab-markup-protect.test.ts

@ -137,6 +137,20 @@ describe('getMarkupProtectRanges', () => { @@ -137,6 +137,20 @@ describe('getMarkupProtectRanges', () => {
expect(rangeIntersectsMerged(t.lastIndexOf(npub), npub.length, merged)).toBe(true)
})
it('markdown: freezes a full raw https URL (blossom npub-shaped host; .gif not translatable)', () => {
  // The host label is npub-shaped; the whole URL, including the `.gif` suffix, must be frozen.
  const rawUrl =
    'https://npub1uq6dv4yq94704gk5r22jsqg9gy2wpxkk5dft9q5gugc8tj53nq2qg5q22d.blossom.band/efc560395efdc7327db278ea4a7677905f69fecf1e6db754f41309d62f8ddb23.gif'
  const input = `See this\n${rawUrl}\nnext`
  const frozen = getMarkupProtectRanges(input, 'markdown')

  // At least one frozen range must span from the scheme through the file extension.
  const coversWholeUrl = frozen.some(([from, to]) => {
    const piece = input.slice(from, to)
    return piece.startsWith('https://') && piece.endsWith('.gif') && piece.includes('blossom.band')
  })
  expect(coversWholeUrl).toBe(true)

  // The extension on its own must also fall inside a frozen range.
  expect(rangeIntersectsMerged(input.indexOf('.gif'), 4, frozen)).toBe(true)
})
it('freezes BOOKSTR_MARKER passthrough and WIKILINK marker', () => {
const book = 'BOOKSTR_MARKER:foo:BOOKSTR_END'
const wiki = 'WIKILINK:my-page[My Page]'
@ -224,6 +238,27 @@ describe('translateAdvancedLabMarkup', () => { @@ -224,6 +238,27 @@ describe('translateAdvancedLabMarkup', () => {
expect(out).toBe('[<Hi>](https://x.com)')
})
it('markdown: does not translate raw https URL (npub-like blossom host / .gif path)', async () => {
  const rawUrl =
    'https://npub1uq6dv4yq94704gk5r22jsqg9gy2wpxkk5dft9q5gugc8tj53nq2qg5q22d.blossom.band/x.gif'
  const input = `Before\n${rawUrl}\nAfter`
  const translated = await translateAdvancedLabMarkup(input, 'de', 'en', 'markdown')
  // The URL survives verbatim, and only the surrounding prose lines come back wrapped in `<…>`.
  expect(translated).toContain(rawUrl)
  expect(translated).toBe(`<Before>\n${rawUrl}\n<After>`)
})
it('markdown: freezes #hashtag tokens when mixed with prose', () => {
  const input = 'Cool #meme and #memestr stuff'
  const frozen = getMarkupProtectRanges(input, 'markdown')
  // Each hashtag, measured over its full length, must overlap a frozen range.
  for (const tag of ['#meme', '#memestr']) {
    expect(rangeIntersectsMerged(input.indexOf(tag), tag.length, frozen)).toBe(true)
  }
})
it('markdown: leaves hashtags unchanged inside translated prose', async () => {
  const input = 'Enjoy #meme today'
  const translated = await translateAdvancedLabMarkup(input, 'de', 'en', 'markdown')
  // The hashtag must appear verbatim in the output (checked both as a substring and as a pattern).
  expect(translated).toContain('#meme')
  expect(translated).toMatch(/#meme/)
})
it('markdown: translates optional link title in quotes', async () => {
const out = await translateAdvancedLabMarkup(
'[Hi](https://x.com "Link title")',

55
src/lib/advanced-lab-markup-protect.ts

@ -5,7 +5,9 @@ @@ -5,7 +5,9 @@
* emphasis/strike delimiters, and AsciiDoc blocks, macros, stem, passthrough, xref.
* Also: wiki `[[…]]` (incl. `book::`, `citation::`), `wikilink:`, `BOOKSTR_MARKER:…:BOOKSTR_END`,
* `nostr:…` / bare NIP-19 bech32 (`npub1`, `nprofile1`, etc.), and `link:url[text]` macros.
* Raw `http://` / `https://` URLs (so blossom-style hosts `https://npub1….band/…/file.gif` are not split for translation).
* NIP-style custom/native emoji shortcodes `:shortcode:` (see {@link EMOJI_SHORT_CODE_REGEX}).
* Markdown `#hashtag` tokens (Unicode letters/numbers/mark, `_`, `-`).
*/
import { EMOJI_SHORT_CODE_REGEX } from '@/lib/content-patterns'
@ -762,6 +764,42 @@ function collectWikilinkMarkerRanges(text: string, merged: [number, number][]): @@ -762,6 +764,42 @@ function collectWikilinkMarkerRanges(text: string, merged: [number, number][]):
return ranges
}
/**
 * Collects the spans of bare `http://` / `https://` URLs (not only those inside Markdown `[]()`).
 *
 * Motivation: for a host such as `npub1….blossom.band/…/x.gif`, bare-npub protection covers only the
 * first 63 chars after `npub1`, which would leave the path and `.gif` in the translatable stream.
 *
 * @param text - Source text to scan.
 * @param merged - Ranges that are already frozen; an offset for which `posInMerged` returns true
 *   is never used as the start of a URL.
 * @returns One `[start, end]` pair per URL, in ascending order; `end` is the offset of the first
 *   terminating character (or `text.length`).
 */
function collectRawHttpUrlRanges(text: string, merged: [number, number][]): [number, number][] {
  const whitespace = /\s/
  // Besides whitespace, these characters end a URL: angle brackets, quotes, backtick, closing brackets.
  const terminators = new Set(['<', '>', '"', "'", '`', ')', ']', '}'])
  const found: [number, number][] = []
  // A scheme is only looked for while at least 8 code units remain in the text.
  const scanLimit = text.length - 7
  let cursor = 0

  while (cursor < scanLimit) {
    let schemeLength = 0
    if (!posInMerged(cursor, merged)) {
      // Lower-casing the probe makes the scheme comparison case-insensitive.
      const probe = text.slice(cursor, cursor + 8).toLowerCase()
      if (probe.startsWith('https://')) {
        schemeLength = 8
      } else if (probe.startsWith('http://')) {
        schemeLength = 7
      }
    }
    if (schemeLength === 0) {
      cursor += 1
      continue
    }

    // Extend past the scheme up to the first terminator. Only the URL's start offset is checked
    // against `merged`; the extension itself is not.
    let stop = cursor + schemeLength
    while (stop < text.length) {
      const ch = text.charAt(stop)
      if (whitespace.test(ch) || terminators.has(ch)) break
      stop += 1
    }

    found.push([cursor, stop])
    cursor = stop
  }

  return found
}
/** AsciiDoc / Markdown `link:url[text]` and `menu:…[…]` (toolbar + jumble). */
function collectLinkMenuColonMacros(text: string, merged: [number, number][]): [number, number][] {
const ranges: [number, number][] = []
@ -868,6 +906,19 @@ function collectAsciiDocXrefRanges(text: string, merged: [number, number][]): [n @@ -868,6 +906,19 @@ function collectAsciiDocXrefRanges(text: string, merged: [number, number][]): [n
return ranges
}
/**
 * Collects the spans of `#tag` tokens (e.g. `#plebchain`): a `#` followed by one or more Unicode
 * letters, numbers, marks, `_` or `-`.
 *
 * @param text - Source text to scan.
 * @param merged - Ranges that are already frozen (e.g. a URL); a hashtag whose `#` offset makes
 *   `posInMerged` return true is skipped.
 * @returns One `[start, end]` pair per accepted hashtag, in order of appearance.
 */
function collectMarkdownHashtagRanges(text: string, merged: [number, number][]): [number, number][] {
  const hashtagRe = /#[\p{L}\p{N}\p{M}_-]+/gu
  const found: [number, number][] = []
  for (let hit = hashtagRe.exec(text); hit !== null; hit = hashtagRe.exec(text)) {
    const from = hit.index
    if (!posInMerged(from, merged)) {
      found.push([from, from + hit[0].length])
    }
  }
  return found
}
/** `:shortcode:` spans (custom / native emoji); skipped when already inside code, links, etc. */
function collectEmojiShortcodeRanges(text: string, merged: [number, number][]): [number, number][] {
const ranges: [number, number][] = []
@ -896,6 +947,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode @@ -896,6 +947,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode
merged = mergeSortedRanges([...merged, ...bookstrPass, ...wikilinkM])
const linkMenu = collectLinkMenuColonMacros(text, merged)
merged = mergeSortedRanges([...merged, ...linkMenu])
const rawHttpUrls = collectRawHttpUrlRanges(text, merged)
merged = mergeSortedRanges([...merged, ...rawHttpUrls])
const nostrBech = collectNostrAndBech32Ranges(text, merged)
merged = mergeSortedRanges([...merged, ...nostrBech])
const triplePlus = collectAsciiDocTriplePlusPassthrough(text, merged)
@ -912,6 +965,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode @@ -912,6 +965,8 @@ export function getMarkupProtectRanges(text: string, mode: AdvancedLabMarkupMode
const bold = collectMarkdownDelimiterPairEdges(text, mergeSortedRanges([...merged, ...strike]), '**')
const boldU = collectMarkdownDelimiterPairEdges(text, mergeSortedRanges([...merged, ...bold]), '__')
merged = mergeSortedRanges([...merged, ...strike, ...bold, ...boldU])
const hashtags = collectMarkdownHashtagRanges(text, merged)
merged = mergeSortedRanges([...merged, ...hashtags])
} else {
const adocBlocks = collectAsciiDocStructuredBlocks(text, merged)
merged = mergeSortedRanges([...merged, ...adocBlocks])

18
src/lib/translate-client.test.ts

@ -0,0 +1,18 @@ @@ -0,0 +1,18 @@
import { describe, expect, it } from 'vitest'
import { shouldSkipMachineTranslatePlainCore } from '@/lib/translate-client'
describe('shouldSkipMachineTranslatePlainCore', () => {
  it('returns true for one or more ASCII hashtags with spaces', () => {
    // The second sample has leading and trailing spaces around the hashtags.
    const hashtagOnlyInputs = ['#meme #memes #memestr #plebchain', ' #a #b ']
    for (const input of hashtagOnlyInputs) {
      expect(shouldSkipMachineTranslatePlainCore(input)).toBe(true)
    }
  })

  it('returns false when there is non-hashtag prose', () => {
    // Prose after the hashtag, then prose before it.
    const mixedInputs = ['#meme is cool', 'see #meme']
    for (const input of mixedInputs) {
      expect(shouldSkipMachineTranslatePlainCore(input)).toBe(false)
    }
  })

  it('returns true for unicode hashtag letters', () => {
    const accented = '#café #naïve'
    expect(shouldSkipMachineTranslatePlainCore(accented)).toBe(true)
  })
})

12
src/lib/translate-client.ts

@ -144,6 +144,14 @@ export function clearTranslateLanguagesCache(): void { @@ -144,6 +144,14 @@ export function clearTranslateLanguagesCache(): void {
advertisedTranslateApiCodes = null
}
/**
 * Reports whether `core` consists solely of whitespace-separated hashtags, in which case it should
 * be passed through untranslated.
 *
 * LibreTranslate / Argos often corrupts hashtag-only lines (random glyphs, subtitle-like junk,
 * dropped letters), and Nostr-style hashtags must stay verbatim.
 *
 * @param core - Candidate text; surrounding whitespace is ignored.
 * @returns `true` when the trimmed text is one or more `#tag` tokens (Unicode letters, numbers,
 *   marks, `_`, `-`) separated only by whitespace; `false` otherwise, including for empty input.
 */
export function shouldSkipMachineTranslatePlainCore(core: string): boolean {
  const hashtagsOnly = /^(?:#[\p{L}\p{N}\p{M}_-]+(?:\s+#[\p{L}\p{N}\p{M}_-]+)*)\s*$/u
  const trimmed = core.trim()
  return hashtagsOnly.test(trimmed)
}
export async function translatePlainText(
text: string,
targetLang: string,
@ -162,6 +170,10 @@ export async function translatePlainText( @@ -162,6 +170,10 @@ export async function translatePlainText(
return text
}
if (shouldSkipMachineTranslatePlainCore(core)) {
return text
}
const resolvedTarget = translateApiLanguageCode(targetLang)
const resolvedSource =
sourceLang === 'auto' ? 'auto' : translateApiLanguageCode(sourceLang)

Loading…
Cancel
Save