clone of github.com/decent-newsroom/newsroom
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

386 lines
15 KiB

<?php
namespace App\Command;
use App\Entity\Article;
use App\Entity\NzineBot;
use App\Factory\ArticleFactory;
use App\Repository\NzineRepository;
use App\Service\EncryptionService;
use App\Service\NostrClient;
use App\Service\RssFeedService;
use Doctrine\ORM\EntityManagerInterface;
use League\HTMLToMarkdown\HtmlConverter;
use swentel\nostr\Event\Event;
use swentel\nostr\Key\Key;
use swentel\nostr\Sign\Sign;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Component\String\Slugger\AsciiSlugger;
#[AsCommand(
name: 'nzine:rss:fetch',
description: 'Fetch RSS feeds and save new articles for configured nzines',
)]
class RssFetchCommand extends Command
{
/**
 * Wire up the collaborators this command uses; every dependency is stored
 * as a promoted, read-only property.
 */
public function __construct(
    private readonly NzineRepository $nzineRepository,
    private readonly ArticleFactory $factory,
    private readonly RssFeedService $rssFeedService,
    private readonly EntityManagerInterface $entityManager,
    private readonly NostrClient $nostrClient,
    private readonly EncryptionService $encryptionService,
) {
    // Let the base Command finish its own setup (name/description come from #[AsCommand]).
    parent::__construct();
}
/**
 * Fetch the RSS feed of every nzine that has a feed URL, publish each feed
 * item as a NIP-23 long-form event (kind 30023) signed by the nzine's bot,
 * persist the resulting article locally, and finally publish a kind 0
 * profile for the bot built from the feed metadata.
 *
 * Failures are isolated as far as possible: a feed that cannot be fetched,
 * an item that cannot be converted, or a relay publish that fails is
 * reported as a warning and processing continues with the next one.
 *
 * @return int Command::SUCCESS once every nzine has been processed
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $io = new SymfonyStyle($input, $output);
    $slugger = new AsciiSlugger();
    $signer = new Sign();
    $keyHelper = new Key();

    foreach ($this->nzineRepository->findAll() as $nzine) {
        $feedUrl = $nzine->getFeedUrl();
        if (!$feedUrl) {
            continue;
        }

        /** @var NzineBot|null $bot */
        $bot = $nzine->getNzineBot();
        if ($bot === null) {
            // Without a bot there is no key to sign with; skip instead of crashing the whole run.
            $io->warning('No bot configured for feed ' . $feedUrl . ', skipping');
            continue;
        }
        $bot->setEncryptionService($this->encryptionService);
        $nsec = $bot->getNsec();

        $npub = $keyHelper->getPublicKey($nsec);
        $articles = $this->entityManager->getRepository(Article::class)->findBy(['pubkey' => $npub]);
        $io->writeln('Found ' . count($articles) . ' existing articles for bot ' . $npub);
        $io->section('Fetching RSS for: ' . $feedUrl);

        try {
            $feed = $this->rssFeedService->fetchFeed($feedUrl);
        } catch (\Throwable $e) {
            $io->warning('Failed to fetch ' . $feedUrl . ': ' . $e->getMessage());
            continue;
        }

        // A feed without an "items" key is treated as empty rather than raising a warning.
        foreach ($feed['items'] ?? [] as $item) {
            try {
                $event = $this->buildLongFormEvent($item, (string) ($nzine->getSlug() ?? 'nzine'), $slugger);
                $signer->signEvent($event, $nsec);

                // Publish (add/adjust relays as you like); a relay failure must not block local persistence.
                try {
                    $this->nostrClient->publishEvent($event, [
                        'wss://purplepag.es',
                        'wss://relay.damus.io',
                        'wss://nos.lol',
                    ]);
                    $io->writeln('Published long-form event: ' . ($item['title'] ?? '(no title)'));
                } catch (\Throwable $e) {
                    $io->warning('Publish failed: ' . $e->getMessage());
                }

                // Persist locally
                $article = $this->factory->createFromLongFormContentEvent((object) $event->toArray());
                $this->entityManager->persist($article);
            } catch (\Throwable $e) {
                // keep going on item errors
                $io->warning('Item failed: ' . ($item['title'] ?? '(no title)') . ' — ' . $e->getMessage());
            }
        }

        $this->entityManager->flush();
        $io->success('RSS fetch complete for: ' . $feedUrl);

        // --- Update bot profile (kind 0) using feed metadata ---
        $feedMeta = $feed['feed'] ?? null;
        if ($feedMeta) {
            $profile = [
                'name' => $feedMeta['title'] ?? $nzine->getTitle(),
                'about' => $feedMeta['description'] ?? '',
                'picture' => $feedMeta['image'] ?? null,
                'website' => $feedMeta['link'] ?? null,
            ];

            // Encoding and signing live inside the try so that one broken profile
            // cannot abort the remaining nzines.
            try {
                $profileEvent = new Event();
                $profileEvent->setKind(0);
                $profileEvent->setCreatedAt(time());
                $profileEvent->setContent(json_encode(
                    $profile,
                    JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
                ));
                $signer->signEvent($profileEvent, $nsec);
                $this->nostrClient->publishEvent($profileEvent, ['wss://purplepag.es']);
                $io->success('Published bot profile (kind 0) with feed metadata');
            } catch (\Throwable $e) {
                $io->warning('Failed to publish bot profile event: ' . $e->getMessage());
            }
        }
    }

    return Command::SUCCESS;
}

/**
 * Build an unsigned NIP-23 long-form event (kind 30023) from one parsed feed item.
 *
 * Keys read from $item: title, link, pubDate, content, description, image, categories.
 *
 * @param array        $item       one entry of the feed's "items" list
 * @param string       $slugPrefix prefix for the 'd' tag so the identifier stays stable per source item
 * @param AsciiSlugger $slugger    slugger used to derive the 'd' tag
 */
private function buildLongFormEvent(array $item, string $slugPrefix, AsciiSlugger $slugger): Event
{
    $event = new Event();
    $event->setKind(30023); // NIP-23 Long-form content

    // created_at — use parsed pubDate (timestamp int) or now
    $createdAt = isset($item['pubDate']) && is_numeric($item['pubDate'])
        ? (int) $item['pubDate']
        : time();
    $event->setCreatedAt($createdAt);

    // slug (NIP-33 'd' tag) — stable per source item
    $base = trim($slugPrefix . '-' . ($item['title'] ?? ''));
    $slug = (string) $slugger->slug($base)->lower();

    // HTML → Markdown
    $raw = trim($item['content'] ?? '') ?: trim($item['description'] ?? '');
    $rawHtml = $this->normalizeWeirdHtml($raw);
    $event->setContent($this->htmlToMarkdown($this->sanitizeHtml($rawHtml)));

    $tags = [
        ['title', $this->safeStr($item['title'] ?? '')],
        ['d', $slug],
        ['source', $this->safeStr($item['link'] ?? '')],
    ];

    // summary (short description)
    $summary = $this->ellipsis($this->plainText($item['description'] ?? ''), 280);
    if ($summary !== '') {
        $tags[] = ['summary', $summary];
    }

    // image: explicit media tag first, otherwise sniff the first <img> in the content
    if (!empty($item['image'])) {
        $tags[] = ['image', $this->safeStr($item['image'])];
    } elseif (preg_match('~<img[^>]+src="([^"]+)"~i', $rawHtml, $m)) {
        // The attribute value is still HTML-encoded (e.g. "&amp;" in query strings); decode it to a real URL.
        $tags[] = ['image', trim(html_entity_decode($m[1], ENT_QUOTES | ENT_HTML5, 'UTF-8'))];
    }

    // categories → "t" tags. They are collected into $tags because the single
    // setTags() call below defines the final tag list; tags added to the event
    // beforehand would be overwritten by it.
    if (!empty($item['categories']) && is_array($item['categories'])) {
        foreach ($item['categories'] as $category) {
            $cat = trim((string) $category);
            if ($cat !== '') {
                $tags[] = ['t', $cat];
            }
        }
    }

    $event->setTags($tags);

    return $event;
}
/** -------- Helpers: HTML prep + converter + small utils -------- */
/**
 * Flatten document-level wrappers that feeds sometimes embed in item content.
 *
 * 1) Ghost "HTML cards" (delimited by kg-card-begin/end comments) are replaced
 *    by the content of their inner <body>, or by the card content without the
 *    marker comments when there is no <body>.
 * 2) Any remaining <html>/<body> tags and whole <head> sections are removed.
 *
 * If a regex call fails (PCRE error) the input of that step is kept unchanged.
 *
 * @param string $html raw item HTML
 *
 * @return string the HTML fragment without document wrappers
 */
private function normalizeWeirdHtml(string $html): string
{
    // 1) Unwrap Ghost "HTML cards"
    $unwrapped = preg_replace_callback(
        '/<!--\s*kg-card-begin:\s*html\s*-->.*?<!--\s*kg-card-end:\s*html\s*-->/si',
        static function (array $m): string {
            $block = $m[0];

            // Prefer the inner <body>…</body> when the card carries a full document
            if (preg_match('/<body\b[^>]*>(.*?)<\/body>/si', $block, $mm)) {
                return $mm[1];
            }

            // No explicit body; just strip the markers (case-insensitively, like the outer match)
            return preg_replace('/<!--\s*kg-card-(?:begin|end):\s*html\s*-->/i', '', $block) ?? $block;
        },
        $html
    );
    $html = $unwrapped ?? $html;

    // 2) Remove remaining document wrappers that would cut DOM parsing short
    $stripped = preg_replace([
        '/<\/?html[^>]*>/i',
        '/<\/?body[^>]*>/i',
        '/<head\b[^>]*>.*?<\/head>/si',
    ], '', $html);

    return $stripped ?? $html;
}
/**
 * Reduce feed HTML to a small, safe subset before Markdown conversion.
 *
 * Steps:
 *  0) strip <script>/<style> blocks and comments with regexes;
 *  1) rewrite known widgets (lightning-widget, Ghost image/callout cards,
 *     YouTube embeds) into plain markup;
 *  2) run the markup through Tidy when the extension is available;
 *  3) parse with DOMDocument, drop unwanted elements, keep only allowlisted
 *     attributes, neutralise script/data URLs, and serialise the <body> content.
 *
 * The libxml error mode is restored to its previous value after parsing so
 * the call does not change global state for other code.
 *
 * @param string $html HTML fragment (may be malformed)
 *
 * @return string sanitised HTML fragment; the input is returned unchanged when
 *                it is empty, and the pre-cleaned markup when DOM parsing fails
 */
private function sanitizeHtml(string $html): string
{
    if ($html === '') {
        return $html;
    }

    // 0) quick pre-clean: kill scripts/styles early to avoid DOM bloat
    $html = preg_replace('~<(script|style)\b[^>]*>.*?</\1>~is', '', $html) ?? $html;
    $html = preg_replace('~<!--.*?-->~s', '', $html) ?? $html; // comments

    // 1) Normalize weird widgets and wrappers BEFORE DOM parse
    // lightning-widget → simple text. The captured attribute value is already
    // HTML-encoded, so existing entities must not be encoded a second time.
    $html = preg_replace_callback(
        '~<lightning-widget[^>]*\bto="([^"]+)"[^>]*>.*?</lightning-widget>~is',
        static fn (array $m): string => '<p>⚡ Tips: ' . htmlspecialchars($m[1], double_encode: false) . '</p>',
        $html
    ) ?? $html;

    // Ghost/Koenig wrappers: keep useful inner content
    $html = preg_replace('~<figure[^>]*\bkg-image-card\b[^>]*>\s*(<img[^>]+>)\s*</figure>~i', '$1', $html) ?? $html;
    $html = preg_replace('~<div[^>]*\bkg-callout-card\b[^>]*>(.*?)</div>~is', '<blockquote>$1</blockquote>', $html) ?? $html;

    // YouTube iframes → links (with or without "www.", also the privacy-enhanced
    // youtube-nocookie.com host, and iframes that carry fallback content)
    $html = preg_replace_callback(
        '~<iframe[^>]+src="https?://(?:www\.)?youtube(?:-nocookie)?\.com/embed/([A-Za-z0-9_\-]+)[^"]*"[^>]*>.*?</iframe>~is',
        static fn (array $m): string => '<p><a href="https://youtu.be/' . $m[1] . '">Watch on YouTube</a></p>',
        $html
    ) ?? $html;

    // 2) Try to pretty up malformed markup via Tidy (if available)
    if (function_exists('tidy_parse_string')) {
        try {
            $tidy = tidy_parse_string($html, [
                'clean' => true,
                'output-xhtml' => true,
                'show-body-only' => false,
                'wrap' => 0,
                'drop-empty-paras' => true,
                'merge-divs' => true,
                'merge-spans' => true,
                'numeric-entities' => false,
                'quote-ampersand' => true,
            ], 'utf8');
            $tidy->cleanRepair();
            $html = (string) $tidy;
        } catch (\Throwable $e) {
            // Tidy is best-effort only; continue with the un-tidied markup
        }
    }

    // 3) DOM sanitize: remove junk, unwrap html/body/head, allowlist elements/attrs
    $dom = new \DOMDocument('1.0', 'UTF-8');
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $loaded = $dom->loadHTML(
            // force UTF-8 meta so DOMDocument doesn't mangle
            '<!DOCTYPE html><meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html,
            LIBXML_NOWARNING | LIBXML_NOERROR
        );
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }
    if (!$loaded) {
        // fallback: as-is minus tags we already stripped
        return $html;
    }

    $xpath = new \DOMXPath($dom);

    // Remove document/head junk, active content, and every iframe that was not rewritten above
    $unwanted = ['//head', '//script', '//style', '//link', '//meta', '//noscript', '//object', '//embed', '//iframe'];
    foreach ($unwanted as $query) {
        foreach ($xpath->query($query) as $node) {
            $node->parentNode?->removeChild($node);
        }
    }

    // Remove any custom elements we don’t want (e.g., <lightning-widget>, <amp-*>)
    foreach ($xpath->query('//*[starts-with(name(), "amp-") or local-name()="lightning-widget"]') as $node) {
        $node->parentNode?->removeChild($node);
    }

    // Allowlist basic attributes; drop event handlers and script/data URLs
    $allowedAttrs = ['href', 'src', 'alt', 'title', 'width', 'height', 'class'];
    foreach ($xpath->query('//@*') as $attr) {
        $name = $attr->nodeName;
        $value = $attr->nodeValue ?? '';

        if (!in_array($name, $allowedAttrs, true)) {
            $attr->ownerElement?->removeAttributeNode($attr);
            continue;
        }

        if ($name !== 'href' && $name !== 'src') {
            continue;
        }

        $trimmed = trim($value);
        // Whitespace/control characters inside a scheme ("java\tscript:") are commonly
        // ignored by browsers, so they are stripped before the scheme is tested.
        $probe = strtolower(preg_replace('/[\x00-\x20]+/', '', $trimmed) ?? $trimmed);
        $isScriptUrl = str_starts_with($probe, 'javascript:') || str_starts_with($probe, 'vbscript:');
        // Inline data: URLs are only tolerated for image sources, never for links.
        $isInlineImage = $name === 'src' && str_starts_with($probe, 'data:image/');

        if ($isScriptUrl || (str_starts_with($probe, 'data:') && !$isInlineImage)) {
            $attr->ownerElement?->removeAttribute($name);
        } elseif ($trimmed !== $value) {
            // setAttribute() stores the string literally. Assigning to nodeValue may
            // re-parse "&" as the start of an entity reference on some PHP/libxml
            // versions (NOTE(review): confirm), which would corrupt query strings.
            $attr->ownerElement?->setAttribute($name, $trimmed);
        }
    }

    // Unwrap <html> and <body> → gather innerHTML
    $container = $dom->getElementsByTagName('body')->item(0) ?? $dom;

    // Drop empty spans/divs that are just whitespace
    foreach ($xpath->query('.//span|.//div', $container) as $node) {
        if (!trim($node->textContent ?? '') && !$node->getElementsByTagName('*')->length) {
            $node->parentNode?->removeChild($node);
        }
    }

    // Serialize inner HTML of container
    $cleanHtml = '';
    foreach ($container->childNodes as $child) {
        $cleanHtml .= $dom->saveHTML($child);
    }

    // Final tiny cleanups
    $cleanHtml = preg_replace('~\s+</p>~', '</p>', $cleanHtml) ?? $cleanHtml;
    $cleanHtml = preg_replace('~<p>\s+</p>~', '', $cleanHtml) ?? $cleanHtml;

    return trim($cleanHtml);
}
/**
 * Convert sanitised HTML to Markdown and make sure every image is followed
 * by a line break.
 *
 * A linked image — "[![alt](src)](href)" — is treated as one token so the
 * inserted newline lands after the whole link instead of splitting it in two.
 *
 * @param string $html sanitised HTML fragment
 *
 * @return string Markdown with runs of three or more newlines collapsed to two
 */
private function htmlToMarkdown(string $html): string
{
    $md = trim($this->makeConverter()->convert($html));

    // One Markdown image, and the token we pad: a linked image or a bare image.
    $image = '!\[[^\]]*]\([^)]*\)';
    $token = '(?:\[' . $image . ']\([^)]*\)|' . $image . ')';

    // 1) images that already sit alone on a line (optionally inside a blockquote)
    $md = preg_replace('/^(>?\s*)' . $token . '\s*$/m', "$0\n", $md) ?? $md;
    // 2) inline images: add a newline after the token
    $md = preg_replace('/' . $token . '/', "$0\n", $md) ?? $md;

    // collapse any excessive blank lines to max two newlines
    return preg_replace("/\n{3,}/", "\n\n", $md) ?? $md;
}
/**
 * Create the HTML→Markdown converter with this command's fixed options:
 * ATX headers, "**" bold, "*" italic, hard line breaks, unknown tags
 * stripped, and script/style nodes removed.
 */
private function makeConverter(): HtmlConverter
{
    $options = [
        'header_style' => 'atx',
        'bold_style' => '**',
        'italic_style' => '*',
        'hard_break' => true,
        'strip_tags' => true,
        'remove_nodes' => 'script style',
    ];

    return new HtmlConverter($options);
}
/**
 * Reduce an HTML fragment to single-line plain text.
 *
 * Block-level boundaries and <br> become spaces so that adjacent paragraphs
 * are not glued together when the tags are stripped; entities are decoded as
 * UTF-8, and every run of whitespace (including non-breaking spaces) is
 * collapsed to one space.
 *
 * @param string $html HTML fragment
 *
 * @return string trimmed plain text
 */
private function plainText(string $html): string
{
    // Put a space in front of block-level tags; inline tags are left alone so
    // "word</b>." does not turn into "word .".
    $spaced = preg_replace(
        '~</?(?:p|br|div|li|ul|ol|h[1-6]|tr|td|th|table|blockquote|figure|figcaption|section|article|pre)\b[^>]*>~i',
        ' $0',
        $html
    ) ?? $html;

    $text = html_entity_decode(strip_tags($spaced), ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // With the "u" flag, invalid UTF-8 makes preg_replace() return null; keep the text as-is then.
    $text = preg_replace('/[\s\x{00A0}]+/u', ' ', $text) ?? $text;

    return trim($text);
}
/**
 * Shorten text to at most $max characters, ending with "…" when it was cut.
 *
 * Length is counted in characters (mb_*), not bytes. Trailing whitespace
 * left by the cut is removed before the ellipsis is appended.
 *
 * @param string $text text to shorten; surrounding whitespace is trimmed first
 * @param int    $max  maximum length including the ellipsis; values below 1 yield ''
 *
 * @return string the trimmed text, unchanged when it already fits
 */
private function ellipsis(string $text, int $max): string
{
    $text = trim($text);
    if ($text === '') {
        return $text;
    }

    // A non-positive limit would give mb_substr() a negative length, which
    // trims from the end instead of limiting the result.
    if ($max < 1) {
        return '';
    }

    if (mb_strlen($text) <= $max) {
        return $text;
    }

    return rtrim(mb_substr($text, 0, $max - 1)) . '…';
}
/**
 * Null-tolerant trim: null becomes '', any other string is returned
 * without leading/trailing whitespace.
 */
private function safeStr(?string $s): string
{
    if ($s === null) {
        return '';
    }

    return trim($s);
}
}