nzineRepository->findAll(); foreach ($nzines as $nzine) { if (!$nzine->getFeedUrl()) { continue; } /** @var NzineBot $bot */ $bot = $nzine->getNzineBot(); $bot->setEncryptionService($this->encryptionService); $key = new Key(); $npub = $key->getPublicKey($bot->getNsec()); $articles = $this->entityManager->getRepository(Article::class)->findBy(['pubkey' => $npub]); $io->writeln('Found ' . count($articles) . ' existing articles for bot ' . $npub); $io->section('Fetching RSS for: ' . $nzine->getFeedUrl()); try { $feed = $this->rssFeedService->fetchFeed($nzine->getFeedUrl()); } catch (\Throwable $e) { $io->warning('Failed to fetch ' . $nzine->getFeedUrl() . ': ' . $e->getMessage()); continue; } foreach ($feed['items'] as $item) { try { $event = new Event(); $event->setKind(30023); // NIP-23 Long-form content // created_at — use parsed pubDate (timestamp int) or now $createdAt = isset($item['pubDate']) && is_numeric($item['pubDate']) ? (int)$item['pubDate'] : time(); $event->setCreatedAt($createdAt); // slug (NIP-33 'd' tag) — stable per source item $base = trim(($nzine->getSlug() ?? 'nzine') . '-' . ($item['title'] ?? '')); $slug = (string) $slugger->slug($base)->lower(); // HTML → Markdown $raw = trim($item['content'] ?? '') ?: trim($item['description'] ?? ''); $rawHtml = $this->normalizeWeirdHtml($raw); $cleanHtml = $this->sanitizeHtml($rawHtml); $markdown = $this->htmlToMarkdown($cleanHtml); $event->setContent($markdown); // Tags $tags = [ ['title', $this->safeStr($item['title'] ?? '')], ['d', $slug], ['source', $this->safeStr($item['link'] ?? '')], ]; // summary (short description) $summary = $this->ellipsis($this->plainText($item['description'] ?? 
''), 280); if ($summary !== '') { $tags[] = ['summary', $summary]; } // image if (!empty($item['image'])) { $tags[] = ['image', $this->safeStr($item['image'])]; } else { // try to sniff first from content if media tag was missing if (preg_match('~]+src="([^"]+)"~i', $rawHtml, $m)) { $tags[] = ['image', $m[1]]; } } // categories → "t" tags if (!empty($item['categories']) && is_array($item['categories'])) { foreach ($item['categories'] as $category) { $cat = trim((string)$category); if ($cat !== '') { $event->addTag(['t', $cat]); } } } $event->setTags($tags); // Sign event $signer = new Sign(); $signer->signEvent($event, $bot->getNsec()); // Publish (add/adjust relays as you like) try { $this->nostrClient->publishEvent($event, [ 'wss://purplepag.es', 'wss://relay.damus.io', 'wss://nos.lol', ]); $io->writeln('Published long-form event: ' . ($item['title'] ?? '(no title)')); } catch (\Throwable $e) { $io->warning('Publish failed: ' . $e->getMessage()); } // Persist locally $article = $this->factory->createFromLongFormContentEvent((object)$event->toArray()); $this->entityManager->persist($article); } catch (\Throwable $e) { // keep going on item errors $io->warning('Item failed: ' . ($item['title'] ?? '(no title)') . ' — ' . $e->getMessage()); } } $this->entityManager->flush(); $io->success('RSS fetch complete for: ' . $nzine->getFeedUrl()); // --- Update bot profile (kind 0) using feed metadata --- $feedMeta = $feed['feed'] ?? null; if ($feedMeta) { $profile = [ 'name' => $feedMeta['title'] ?? $nzine->getTitle(), 'about' => $feedMeta['description'] ?? '', 'picture' => $feedMeta['image'] ?? null, 'website' => $feedMeta['link'] ?? 
null,
                ];

                // Build the kind-0 (bot profile) event from the feed metadata gathered above.
                $p = new Event();
                $p->setKind(0);
                $p->setCreatedAt(time());
                $p->setContent(json_encode($profile, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE));

                $signer = new Sign();
                $signer->signEvent($p, $bot->getNsec());

                try {
                    $this->nostrClient->publishEvent($p, ['wss://purplepag.es']);
                    $io->success('Published bot profile (kind 0) with feed metadata');
                } catch (\Throwable $e) {
                    // Profile publishing is best-effort: report the failure and carry on.
                    $io->warning('Failed to publish bot profile event: ' . $e->getMessage());
                }
            }
        }

        return Command::SUCCESS;
    }

    /** -------- Helpers: HTML prep + converter + small utils -------- */

    /**
     * Pre-cleans raw feed-item HTML before it is handed to sanitizeHtml().
     *
     * Two regex passes run in order:
     *  1. a callback pass that replaces each match with the text captured ahead
     *     of a closing </body> when one is present, or otherwise with the match
     *     after the marker pattern has been removed from it;
     *  2. a pass that deletes document-level wrapper tags (<html>, <body> and a
     *     span terminated by </head>).
     *
     * NOTE(review): several patterns below ('/.*?/si', '//', '/]*>…/') look as
     * if literal markup was lost from them at some point — confirm them against
     * the canonical copy of this file before relying on their behaviour.
     *
     * @param string $html Raw HTML taken from a feed item.
     *
     * @return string The HTML after both passes.
     */
    private function normalizeWeirdHtml(string $html): string
    {
        // 1) Unwrap Ghost "HTML cards": keep only the content, drop the wrappers and scripts
        $html = preg_replace_callback('/.*?/si', function ($m) {
            $block = $m[0];

            // Prefer the inner body content if the block carries one
            if (preg_match('/]*>(.*?)<\/body>/si', $block, $mm)) {
                $inner = $mm[1];
            } else {
                // No explicit body; just strip the markers
                $inner = preg_replace('//', '', $block);
            }

            return $inner;
        }, $html);

        // 2) Nuke any remaining document wrappers that would cut DOM parsing short
        $html = preg_replace([
            '/<\/?html[^>]*>/i',
            '/<\/?body[^>]*>/i',
            '/]*>.*?<\/head>/si',
        ], '', $html);

        return $html;
    }

    private function sanitizeHtml(string $html): string
    {
        if ($html === '') return $html;

        // 0) quick pre-clean: kill scripts/styles early to avoid DOM bloat
        $html = preg_replace('~<(script|style)\b[^>]*>.*?~is', '', $html);
        $html = preg_replace('~~s', '', $html); // comments

        // 1) Normalize weird widgets and wrappers BEFORE DOM parse
        // lightning-widget → simple text
        $html = preg_replace_callback(
            '~]*\bto="([^"]+)"[^>]*>.*?~is',
            fn($m) => '

⚡ Tips: ' . htmlspecialchars($m[1]) . '

', $html ); // Ghost/Koenig wrappers: keep useful inner content $html = preg_replace('~]*\bkg-image-card\b[^>]*>\s*(]+>)\s*~i', '$1', $html); $html = preg_replace('~]*\bkg-callout-card\b[^>]*>(.*?)~is', '
$1
', $html); // YouTube iframes → links $html = preg_replace_callback( '~]+src="https?://www\.youtube\.com/embed/([A-Za-z0-9_\-]+)[^"]*"[^>]*>~i', fn($m) => '

Watch on YouTube

', $html ); // 2) Try to pretty up malformed markup via Tidy (if available) if (function_exists('tidy_parse_string')) { try { $tidy = tidy_parse_string($html, [ 'clean' => true, 'output-xhtml' => true, 'show-body-only' => false, 'wrap' => 0, 'drop-empty-paras' => true, 'merge-divs' => true, 'merge-spans' => true, 'numeric-entities' => false, 'quote-ampersand' => true, ], 'utf8'); $tidy->cleanRepair(); $html = (string)$tidy; } catch (\Throwable $e) { // ignore tidy failures } } // 3) DOM sanitize: remove junk, unwrap html/body/head, allowlist elements/attrs $dom = new \DOMDocument('1.0', 'UTF-8'); libxml_use_internal_errors(true); $loaded = $dom->loadHTML( // force UTF-8 meta so DOMDocument doesn't mangle ''.$html, LIBXML_NOWARNING | LIBXML_NOERROR ); libxml_clear_errors(); if (!$loaded) { // fallback: as-is minus tags we already stripped return $html; } $xpath = new \DOMXPath($dom); // Remove ,