helpers slug.ts

83.72% Statements 72/86
89.28% Branches 125/140
100% Functions 7/7
84.61% Lines 55/65
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5x
 
 
5x
 
5x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49242x
49242x
49172x
49172x
49172x
29679x
24732x
9290x
 
19493x
19493x
23178x
23178x
23178x
3975x
10x
 
 
 
 
 
15588x
 
 
 
 
 
 
 
 
 
 
 
 
 
15588x
 
15588x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60731x
15546x
 
 
42x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42x
 
1745x
5x
 
 
37x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1242x
4x
 
 
33x
 
 
 
 
 
 
 
30454x
30454x
 
 
30454x
17996x
17996x
17996x
17996x
17996x
17996x
17996x
17996x
 
 
30454x
30454x
213178x
 
 
30454x
 
 
 
 
 
 
30454x
30454x
 
 
30454x
17996x
 
 
30454x
18788x
18788x
18788x
 
 
86586x
 
  /**
 * @module generate-news-indexes/helpers/slug
 * @description Slug-derived classification and topic/tag extraction.
 * Recognises subfolder slugs against the article-types registry first and
 * falls back to multi-language keyword detection for legacy articles.
 *
 * The language-suffix regular expression is built from the canonical
 * `LANGUAGES` constant so this module is the single source of truth for
 * the 14 supported language codes used by the news-index renderer.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import type { ArticleTypeValue } from '../types.js';
import { LANGUAGES } from '../constants.js';
import { getBySubfolder } from '../../render-lib/article-types.js';
 
/**
 * Regex alternation (`code1|code2|…`) assembled from the keys of the
 * `LANGUAGES` constant, so the set of codes is defined in exactly one place.
 */
const LANG_CODES = Object.keys(LANGUAGES).join('|');

/**
 * Matches a trailing `-<lang>.html` file-name suffix and captures the
 * language code in group 1. Shared with `article-merge.ts`.
 */
export const LANG_SUFFIX_RE: RegExp = new RegExp('-(' + LANG_CODES + ')\\.html$');
 
/**
 * Ordered `[topic, pattern]` pairs used to infer topics from free text
 * (file name plus the leading part of the article content).
 *
 * All patterns are case-insensitive and deliberately carry no `g` flag, so
 * `RegExp.prototype.test` stays stateless across calls. Array order is the
 * order in which inferred topics are emitted.
 */
const TOPIC_INFERENCE_PATTERNS: ReadonlyArray<readonly [string, RegExp]> = [
  ['committees', /(committee|committeereports|utskott|betänkande|committee reports|rapport de commission|ausschuss|위원회|委员会|委員会)/i],
  ['legislation', /(proposition|motion|bill|legislative|lagstift|lovgiv|lainsäädäntö|gesetz|législation|legislación|立法|입법)/i],
  ['parliament', /(interpellation|riksdag|parliament|riksdagen|parlament|val\b|election|voting|vote|议会|의회|議会)/i],
  ['government', /(government|regering|regerings|minister|ministry|kristersson|gouvernement|gobierno|regierung|الحكومة|ממשלה|政府)/i],
  ['defense', /(defen[cs]e|försvar|forsvar|puolustus|verteidigung|défense|defensa|defensie|国防|防衛|국방)/i],
  ['environment', /(environment|climate|miljö|miljø|ympäristö|umwelt|environnement|medio ambiente|milieu|环境|環境|환경)/i],
  // `eu` must be a standalone word: as a bare substring it also matched
  // "neutral", "museum", "Reuters", … and made the explicit
  // "european union" alternatives unreachable.
  ['eu', /(\beu\b|european union|europeiska unionen|union européenne|europäische union)/i],
] as const;
 
/**
 * Matches `YYYY-MM-DD-<slug>-<lang>.html`; capture group 1 is the slug.
 * Compiled once at module load instead of on every classification call.
 */
const DATED_ARTICLE_FILE_RE = new RegExp(`^\\d{4}-\\d{2}-\\d{2}-(.+?)-(${LANG_CODES})\\.html$`);

/** Matches a `YYYY-MM-DD-<slug>` directory name; capture group 1 is the slug. */
const DATED_DIRECTORY_RE = /^\d{4}-\d{2}-\d{2}-(.+)$/;

/** Content keywords that mark a forward-looking article (compared against lowercased content). */
const PROSPECTIVE_KEYWORDS: readonly string[] = [
  'week ahead', 'week-ahead', 'upcoming', 'preview', 'look ahead', // en
  'veckan som kommer', 'kommande vecka', 'framåtblick',            // sv
  'ugen der kommer', 'kommende uge', 'fremadrettet',               // da
  'uken som kommer', 'fremtidsrettet',                             // no
  'tuleva viikko', 'ennakko',                                      // fi
  'woche voraus', 'vorschau',                                      // de
  'semaine à venir', 'aperçu',                                     // fr
  'semana por delante', 'adelanto',                                // es
  'week vooruit', 'vooruitblik',                                   // nl
  'الأسبوع المقبل', 'القادم',                                      // ar
  'השבוע הבא', 'הקרוב',                                            // he
  '来週の展望', '今後',                                              // ja
  '주간 전망', '다가오는',                                           // ko
  '一周展望', '即将',                                                // zh
].map((kw) => kw.toLowerCase());

/** Content keywords that mark an analysis article (compared against lowercased content). */
const ANALYSIS_KEYWORDS: readonly string[] = [
  'committee reports', 'analysis', 'review', 'assessment',              // en
  'utskottsbetänkanden', 'analys', 'granskning', 'betänkande',          // sv
  'udvalgsrapporter', 'analyse', 'gennemgang', 'udvalgsbetænkning',     // da
  'komitérapporter', 'gjennomgang', 'komitéinnstilling',                // no
  'valiokuntaraportit', 'analyysi', 'katsaus', 'valiokunnan mietintö',  // fi
  'ausschussberichte', 'überprüfung', 'ausschussbericht',               // de
  'rapports de commission', 'examen', 'rapport de commission',          // fr
  'informes de comité', 'análisis', 'revisión', 'informe de comité',    // es
  'commissierapporten', 'beoordeling', 'commissieverslag',              // nl
  'تقارير اللجان', 'تحليل', 'تقرير اللجنة',                             // ar
  'דוחות ועדות', 'ניתוח', 'דוח ועדה',                                   // he
  '委員会報告', '分析',                                                   // ja
  '위원회 보고서', '분석',                                                // ko
  '委员会报告', '分析',                                                   // zh
].map((kw) => kw.toLowerCase());

/** Content keywords that mark a breaking-news article (compared against lowercased content). */
const BREAKING_KEYWORDS: readonly string[] = [
  'breaking', 'urgent', 'alert', 'flash',       // en
  'senaste nytt', 'akut', 'brådskande',         // sv
  'seneste nyt', 'hastesag',                    // da
  'siste nytt', 'haster',                       // no
  'viimeisimmät', 'kiireellinen', 'hälytys',    // fi
  'eilmeldungen', 'dringend', 'alarm',          // de
  'dernières nouvelles', 'alerte',              // fr
  'última hora', 'urgente', 'alerta',           // es
  'laatste nieuws', 'alert',                    // nl
  'أخبار عاجلة', 'عاجل',                        // ar
  'חדשות אחרונות', 'דחוף',                      // he
  '速報', '緊急',                                // ja
  '속보', '긴급',                                // ko
  '突发新闻', '紧急',                            // zh
].map((kw) => kw.toLowerCase());

/**
 * Look `subfolder` up in the article-types registry and translate the entry's
 * family into an article type. Returns `undefined` when the registry has no
 * entry or the entry's family is not one of the three mapped families.
 */
function articleTypeFromRegistry(subfolder: string): ArticleTypeValue | undefined {
  const entry = getBySubfolder(subfolder);
  if (!entry) return undefined;
  if (entry.family === 'long-horizon-forecast') return 'prospective';
  if (entry.family === 'single-type') return 'analysis';
  if (entry.family === 'tier-c-aggregation') return 'retrospective';
  return undefined;
}

/**
 * Classify an article as `prospective`, `analysis`, `breaking` or
 * `retrospective`.
 *
 * Resolution order (first hit wins):
 * 1. Registry lookup of the slug in a `YYYY-MM-DD-<slug>-<lang>.html` file
 *    name, then of each progressively shorter hyphen-delimited slug prefix.
 * 2. Registry lookup of the slug of the first segment of `relativePath`,
 *    when that segment looks like `YYYY-MM-DD-<slug>`.
 * 3. Keyword fallback for legacy articles, checked in the order
 *    prospective → analysis → breaking, using file-name markers and
 *    multi-language keywords searched in the lowercased content.
 * 4. `retrospective` when nothing else matched.
 *
 * @param content - Article content searched for the fallback keywords.
 * @param fileName - Article file name (no directory part expected).
 * @param relativePath - Optional path of the article; only its first
 *   `/`-separated segment is inspected.
 * @returns The resolved article type.
 */
export function classifyArticleType(content: string, fileName: string, relativePath?: string): ArticleTypeValue {
  const slug = DATED_ARTICLE_FILE_RE.exec(fileName)?.[1];
  if (slug !== undefined) {
    const exact = articleTypeFromRegistry(slug);
    if (exact) return exact;

    // Slugs may carry a trailing qualifier after the registered subfolder
    // name, so retry with the longest prefix first, dropping one
    // hyphen-separated part at a time.
    const parts = slug.split('-');
    for (let i = parts.length - 1; i >= 1; i--) {
      const fromPrefix = articleTypeFromRegistry(parts.slice(0, i).join('-'));
      if (fromPrefix) return fromPrefix;
    }
  }

  if (relativePath && relativePath.includes('/')) {
    // Text before the first '/', i.e. the top-most directory of the path.
    const firstSegment = relativePath.slice(0, relativePath.indexOf('/'));
    const dirSlug = DATED_DIRECTORY_RE.exec(firstSegment)?.[1];
    if (dirSlug !== undefined) {
      const fromDirectory = articleTypeFromRegistry(dirSlug);
      if (fromDirectory) return fromDirectory;
    }
  }

  const lowerContent = content.toLowerCase();
  const contentHasAny = (keywords: readonly string[]): boolean =>
    keywords.some((kw) => lowerContent.includes(kw));

  if (fileName.includes('week-ahead') || fileName.includes('month-ahead') || contentHasAny(PROSPECTIVE_KEYWORDS)) {
    return 'prospective';
  }

  if (
    fileName.includes('committee-reports') ||
    fileName.includes('propositions') ||
    fileName.includes('motions') ||
    fileName.includes('deep-inspection') ||
    contentHasAny(ANALYSIS_KEYWORDS)
  ) {
    return 'analysis';
  }

  if (fileName.includes('breaking') || contentHasAny(BREAKING_KEYWORDS)) {
    return 'breaking';
  }

  return 'retrospective';
}
 
/**
 * Matches an `article:tag` value that refers to the EU: the standalone word
 * `eu`, or any word starting with `europ` (Europe, European, europeiska, …).
 * Applied to lowercased tags. A plain `includes('eu')` also matched unrelated
 * words such as "neutrality" or "museum".
 */
const EU_TAG_RE = /\beu\b|\beurop/;

/**
 * Ordered `[topic, keywords]` table for mapping an `article:tag` value to a
 * topic. A tag maps to a topic when its lowercased text contains any of the
 * keywords. Array order is the order in which topics are emitted per tag.
 */
const TAG_TOPIC_KEYWORDS: ReadonlyArray<readonly [string, readonly string[]]> = [
  ['parliament', ['parliament', 'riksdag', 'parlamentet', '議会', '의회', '议会', 'البرلمان', 'פרלמנט']],
  ['government', ['government', 'regering', 'regjeringen', 'hallitus', 'regierung', 'gouvernement', 'gobierno', '政府', '정부', 'الحكومة', 'ממשלה']],
  ['defense', ['defense', 'defence', 'försvar', 'forsvar', 'puolustus', 'verteidigung', 'défense', 'defensa', 'defensie', 'الدفاع', 'הגנה', '防衛', '국방', '国防']],
  ['environment', ['environment', 'miljö', 'miljø', 'ympäristö', 'umwelt', 'environnement', 'medio ambiente', 'milieu', 'البيئة', 'סביבה', '環境', '환경', '环境']],
  ['committees', ['committee', 'utskott', 'udvalg', 'utvalg', 'valiokunt', 'ausschuss', 'commission', 'comité', 'commissie', 'لجنة', 'ועדה', '委員会', '위원회', '委员会']],
  ['legislation', ['legislation', 'lagstiftning', 'lovgivning', 'lainsäädäntö', 'gesetzgebung', 'législation', 'legislación', 'wetgeving', 'التشريعات', 'חקיקה', '立法', '입법']],
];

/**
 * Derive up to five distinct topics for an article.
 *
 * Topics come from two sources, in this order:
 * 1. Every `<meta property="article:tag" content="…">` value in `content`,
 *    mapped through the EU matcher and the multi-language keyword table.
 * 2. `TOPIC_INFERENCE_PATTERNS`, tested against `fileName` followed by the
 *    first 2500 characters of `content`.
 *
 * @param content - Article HTML.
 * @param fileName - Article file name; included in the inference sample.
 * @returns De-duplicated topics in first-seen order, capped at five entries.
 */
export function extractTopics(content: string, fileName: string = ''): string[] {
  const topics: string[] = [];
  const tagPattern = /<meta\s+property=["']article:tag["']\s+content=["']([^"']+)["']/gi;

  for (const match of content.matchAll(tagPattern)) {
    // Group 1 always participates in a successful match.
    const tag = match[1]!.toLowerCase();
    if (EU_TAG_RE.test(tag)) topics.push('eu');
    for (const [topic, keywords] of TAG_TOPIC_KEYWORDS) {
      if (keywords.some((kw) => tag.includes(kw))) topics.push(topic);
    }
  }

  const sourceSample = `${fileName} ${content.slice(0, 2500)}`;
  for (const [topic, pattern] of TOPIC_INFERENCE_PATTERNS) {
    if (pattern.test(sourceSample)) topics.push(topic);
  }

  return [...new Set(topics)].slice(0, 5);
}
 
/**
 * Collect the display tags of an article.
 *
 * Every `<meta property="article:tag" content="…">` value found in `content`
 * is used verbatim. When the article declares no such tag, the list is
 * instead seeded with the classified article type followed by the inferred
 * topics (`inferredTopics` when supplied, otherwise computed via
 * `extractTopics`).
 *
 * @returns Distinct tags that are not blank, in first-seen order, capped at four.
 */
export function extractTags(content: string, fileName: string = '', inferredTopics?: string[]): string[] {
  const metaTagRe = /<meta\s+property=["']article:tag["']\s+content=["']([^"']+)["']/gi;
  const collected: string[] = Array.from(content.matchAll(metaTagRe), (hit) => hit[1]!);

  if (collected.length === 0) {
    // No declared tags: fall back to type + topics.
    const topics = inferredTopics ?? extractTopics(content, fileName);
    const articleType = classifyArticleType(content, fileName);
    collected.push(articleType, ...topics);
  }

  const distinct = new Set<string>();
  for (const candidate of collected) {
    if (candidate.trim().length > 0) distinct.add(candidate);
  }
  return Array.from(distinct).slice(0, 4);
}