Press n or j to go to the next uncovered block, b, p or k for the previous block.
/**
* Extract News Article Metadata to JSON Database
*
* Parses all news article HTML files and extracts Schema.org JSON-LD
* metadata into a single data/news-articles.json file.
*
* Usage: node --experimental-strip-types scripts/extract-news-metadata.ts
*
* @module scripts/extract-news-metadata
*/
import { readFileSync, writeFileSync, readdirSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Parent of this script's directory; the news/ input and data/ output paths are resolved against it.
const ROOT = join(__dirname, '..');
/**
 * One record of the generated news database, describing a single
 * language version of an article.
 */
interface ArticleMetadata {
/** Article identifier derived from the file name with the trailing `-xx.html` language suffix removed. */
slug: string;
/** Name of the HTML file inside the news directory that this record was extracted from. */
file: string;
/** Two-letter language code taken from the file name suffix; 'en' when the name has no such suffix. */
lang: string;
/** JSON-LD `headline`, or an empty string when absent. */
headline: string;
/** JSON-LD `description`, or an empty string when absent. */
description: string;
/** JSON-LD `datePublished`, or an empty string when absent; primary sort key of the database. */
datePublished: string;
/** JSON-LD `dateModified`, or an empty string when absent. */
dateModified: string;
/** JSON-LD `articleSection`, or an empty string when absent. */
articleSection: string;
/** JSON-LD `wordCount` when present and non-zero; otherwise a count computed from the page's HTML. */
wordCount: number;
/** JSON-LD `inLanguage`, falling back to `lang` when absent. */
inLanguage: string;
/** JSON-LD `keywords`, or an empty string when absent. */
keywords: string;
/** URL captured from the page's `og:image` meta tag, or an empty string when none was found. */
image: string;
/** Article URL taken from the JSON-LD data (`mainEntityOfPage['@id']`); empty string when unavailable. */
url: string;
}
/** Shape of the document serialized to data/news-articles.json. */
interface NewsDatabase {
/** Version string of the database; the generator in this file writes '1.0.0'. */
version: string;
/** ISO 8601 timestamp (`Date.prototype.toISOString`) of when the file was generated. */
generatedAt: string;
/** Number of entries in `articles`. */
totalArticles: number;
/** Number of distinct `slug` values across `articles`. */
uniqueSlugs: number;
/** Distinct `lang` values across `articles`, sorted. */
languages: string[];
/** All extracted records, ordered by `datePublished` descending and then by `lang`. */
articles: ArticleMetadata[];
}
/**
 * Subset of Schema.org JSON-LD properties this script looks at in a parsed
 * `application/ld+json` block. Every property is optional because the
 * parsed JSON is not validated.
 */
interface JsonLdArticle {
/** Schema.org type; compared against 'NewsArticle' to select the relevant block. */
'@type'?: string;
/** Article headline. */
headline?: string;
/** Article description. */
description?: string;
/** Publication date string. */
datePublished?: string;
/** Last-modification date string. */
dateModified?: string;
/** Section the article belongs to. */
articleSection?: string;
/** Declared word count; a computed count is used instead when this is missing or zero. */
wordCount?: number;
/** Declared content language; the file-name language is used when this is missing. */
inLanguage?: string;
/** Keywords string. */
keywords?: string;
/** Page reference whose `@id` supplies the article URL. */
mainEntityOfPage?: { '@id'?: string };
/** Optional `url` property of the JSON-LD object. */
url?: string;
}
/**
 * Matches every JSON-LD `<script>` element in a page; capture group 1 is the
 * raw JSON payload. Additional attributes on the tag are tolerated.
 */
const JSON_LD_SCRIPT_PATTERN =
  /<script\b[^>]*?\stype="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi;

/** Matches `<script>` and `<style>` elements together with their contents. */
const NON_TEXT_ELEMENT_PATTERN = /<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi;

/** Matches the `og:image` meta tag; capture group 1 is the image URL. */
const OG_IMAGE_PATTERN = /property="og:image" content="([^"]+)"/;

/** Matches the two-letter language suffix of a file name, e.g. `-sv.html`. */
const LANG_SUFFIX_PATTERN = /-([a-z]{2})\.html$/;

/** Matches the `.html` extension together with an optional language suffix. */
const SLUG_SUFFIX_PATTERN = /(?:-[a-z]{2})?\.html$/;

/**
 * Searches a parsed JSON-LD value for a `NewsArticle` node.
 *
 * Handles a plain object, a top-level array of nodes and an `@graph`
 * container, and accepts `@type` given either as a string or as an array.
 *
 * @param node - Result of `JSON.parse` on a JSON-LD block.
 * @returns The first `NewsArticle` node found, or `null` when there is none.
 */
function findNewsArticle(node: unknown): JsonLdArticle | null {
  if (Array.isArray(node)) {
    for (const item of node) {
      const found = findNewsArticle(item);
      if (found !== null) return found;
    }
    return null;
  }
  if (typeof node !== 'object' || node === null) return null;

  const record = node as Record<string, unknown>;
  const type = record['@type'];
  const isNewsArticle = Array.isArray(type)
    ? type.includes('NewsArticle')
    : type === 'NewsArticle';
  if (isNewsArticle) return node as JsonLdArticle;

  // Not an article itself: it may be a wrapper carrying its nodes in `@graph`.
  return findNewsArticle(record['@graph']);
}

/**
 * Counts whitespace-separated words in the visible text of an HTML page.
 *
 * `<script>` and `<style>` contents are dropped first so that embedded
 * JSON-LD, JavaScript and CSS are not counted as article words.
 *
 * @param html - Full HTML source of the page.
 * @returns Number of non-empty tokens left after removing markup.
 */
function countWords(html: string): number {
  const text = html.replace(NON_TEXT_ELEMENT_PATTERN, ' ').replace(/<[^>]+>/g, ' ');
  return text.split(/\s+/).filter((word) => word.length > 0).length;
}

/**
 * Reads one article file and builds its metadata record.
 *
 * @param newsDir - Directory containing the article HTML files.
 * @param file - File name of the article inside `newsDir`.
 * @returns The metadata record, or `null` when the page has no `NewsArticle` JSON-LD.
 * @throws If the file cannot be read.
 */
function readArticle(newsDir: string, file: string): ArticleMetadata | null {
  const content = readFileSync(join(newsDir, file), 'utf-8');

  let articleData: JsonLdArticle | null = null;
  // matchAll iterates over a copy of the pattern, so the shared global regex
  // keeps no lastIndex state between files.
  for (const match of content.matchAll(JSON_LD_SCRIPT_PATTERN)) {
    try {
      articleData = findNewsArticle(JSON.parse(match[1] ?? ''));
    } catch {
      // Best effort: one broken block must not hide a valid one later in the page.
      console.warn(`WARN: Skipping malformed JSON-LD block in ${file}`);
      continue;
    }
    if (articleData !== null) break;
  }
  if (articleData === null) return null;

  const lang = LANG_SUFFIX_PATTERN.exec(file)?.[1] ?? 'en';
  // Schema.org allows `keywords` to be repeated, which JSON-LD encodes as an array.
  const rawKeywords: unknown = articleData.keywords;

  return {
    // Strip `.html` even when there is no language suffix, so that `foo.html`
    // and `foo-sv.html` share the slug `foo`.
    slug: file.replace(SLUG_SUFFIX_PATTERN, ''),
    file,
    lang,
    headline: articleData.headline ?? '',
    description: articleData.description ?? '',
    datePublished: articleData.datePublished ?? '',
    dateModified: articleData.dateModified ?? '',
    articleSection: articleData.articleSection ?? '',
    // `||` is intentional: a declared count of 0 is treated like a missing one.
    wordCount: articleData.wordCount || countWords(content),
    inLanguage: articleData.inLanguage ?? lang,
    keywords: Array.isArray(rawKeywords) ? rawKeywords.join(', ') : (articleData.keywords ?? ''),
    image: OG_IMAGE_PATTERN.exec(content)?.[1] ?? '',
    url: articleData.mainEntityOfPage?.['@id'] ?? articleData.url ?? '',
  };
}

/**
 * Builds data/news-articles.json from the article pages in news/.
 *
 * Every `*.html` file in the news directory whose name does not start with
 * `index` is scanned for a Schema.org `NewsArticle` JSON-LD block. Files
 * without one are reported with a warning and skipped. The collected records
 * are sorted by publication date (newest first) and then by language, written
 * as pretty-printed JSON, and a short summary is logged.
 *
 * @throws If the news directory cannot be read or the output cannot be written.
 */
function extractMetadata(): void {
  const newsDir = join(ROOT, 'news');
  const files = readdirSync(newsDir).filter(
    (name) => name.endsWith('.html') && !name.startsWith('index'),
  );

  const articles: ArticleMetadata[] = [];
  for (const file of files) {
    const article = readArticle(newsDir, file);
    if (article === null) {
      console.warn(`WARN: No NewsArticle JSON-LD in ${file}`);
      continue;
    }
    articles.push(article);
  }

  // Sort by date descending, then by language
  articles.sort(
    (a, b) => b.datePublished.localeCompare(a.datePublished) || a.lang.localeCompare(b.lang),
  );

  const db: NewsDatabase = {
    version: '1.0.0',
    generatedAt: new Date().toISOString(),
    totalArticles: articles.length,
    uniqueSlugs: new Set(articles.map((a) => a.slug)).size,
    languages: [...new Set(articles.map((a) => a.lang))].sort(),
    articles,
  };

  // Serialize once so the reported size describes exactly what is written.
  const serialized = JSON.stringify(db, null, 2);
  mkdirSync(join(ROOT, 'data'), { recursive: true });
  writeFileSync(join(ROOT, 'data', 'news-articles.json'), serialized);

  // The file is written as UTF-8, so measure encoded bytes rather than UTF-16 code units.
  const sizeKb = new TextEncoder().encode(serialized).length / 1024;

  console.log('Generated data/news-articles.json:');
  console.log(' Total articles:', db.totalArticles);
  console.log(' Unique slugs:', db.uniqueSlugs);
  console.log(' Languages:', db.languages.join(', '));
  console.log(' File size:', sizeKb.toFixed(1), 'KB');
}
// Script entry point: run the extraction as soon as the module is executed.
extractMetadata();
|