Skip to content

Commit cf491d2

Browse files
committed
poc: generate markdown copies of each HTML file for agents
1 parent 6f14ac5 commit cf491d2

File tree

4 files changed

+408
-15
lines changed

4 files changed

+408
-15
lines changed

data/onPostBuild/index.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,11 @@
11
import { GatsbyNode } from 'gatsby';
22
import { onPostBuild as llmstxt } from './llmstxt';
33
import { onPostBuild as compressAssets } from './compressAssets';
4+
import { onPostBuild as markdownOutput } from './markdownOutput';
45

56
/**
 * Gatsby `onPostBuild` entry point.
 *
 * Runs the individual post-build steps one after another, each receiving the
 * same Gatsby `args`. The order is: llms.txt generation, markdown export,
 * asset compression. A step only starts once the previous one has settled,
 * and a rejection from any step stops the remaining ones.
 */
export const onPostBuild: GatsbyNode['onPostBuild'] = async (args) => {
  // Ordered list of post-build steps; they run strictly in sequence
  const steps = [llmstxt, markdownOutput, compressAssets];

  for (const step of steps) {
    await step(args);
  }
};

data/onPostBuild/markdownOutput.ts

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
import { GatsbyNode, Reporter } from 'gatsby';
2+
import fs from 'fs/promises';
3+
import path from 'path';
4+
import { glob } from 'glob';
5+
import { JSDOM, VirtualConsole } from 'jsdom';
6+
import TurndownService from 'turndown';
7+
8+
/** Locations and switches used by the markdown export. */
const CONFIG = {
  // Directory searched for built HTML pages (used as the glob `cwd`)
  htmlDir: './public',
  // Directory the generated markdown files are written into
  markdownDir: './public',
  // Glob patterns, relative to `htmlDir`, that are skipped by the export
  excludePatterns: ['404.html', 'api/**/*', 'page-data/**/*', 'static/**/*', 'docs/404.html'],
  // When true, a front-matter header (title, url, timestamp, description) is prepended
  includeMetadata: true,
};

// Selectors for elements to remove from the HTML before converting to markdown
const UNWANTED_ELEMENTS_SELECTOR =
  'script, style, nav[role="navigation"], .header, #header, header, .footer, #footer, footer, [aria-label="breadcrumb"], aside';

// Prioritised selectors for the main content of the page, first match wins
const CONTENT_SELECTORS = ['main', '[role="main"]', '.content', '#content', 'article'];
21+
22+
/**
 * Drops a single trailing `/` from the given string.
 * The bare root path `/` is returned unchanged.
 */
const withoutTrailingSlash = (path: string) => {
  if (path === '/') {
    return path;
  }
  return path.endsWith('/') ? path.slice(0, -1) : path;
};
23+
24+
/**
 * Normalises an attribute value before it is embedded in a markdown link title.
 *
 * @param attribute - Raw attribute value, or `null` when the attribute is absent.
 * @returns `''` for a null/empty value; otherwise the value with every newline
 *   and the whitespace run that follows it collapsed to a single `\n`.
 */
const cleanAttribute = (attribute: string | null) => {
  // `\s` also matches `\n`, so `\n\s*` already consumes a whole run of blank
  // lines and indentation; no nested quantifier is needed.
  return attribute ? attribute.replace(/\n\s*/g, '\n') : '';
};
27+
28+
/**
 * Writes a markdown copy of every built HTML page.
 *
 * For each `*.html` file found under `CONFIG.htmlDir` (minus
 * `CONFIG.excludePatterns`) the page is parsed with jsdom, elements matching
 * `UNWANTED_ELEMENTS_SELECTOR` are removed, the first element matching one of
 * `CONTENT_SELECTORS` (falling back to `<body>`) is converted with turndown,
 * and the result is written under `CONFIG.markdownDir`. `a/b/index.html`
 * becomes `a/b.md`; any other file gets `.md` appended to its name.
 *
 * A failure on one page is reported and does not stop the remaining pages.
 *
 * @param reporter - Gatsby reporter used for progress and error output.
 * @param siteUrl - Absolute site URL; prefixed to root-relative links.
 */
async function exportToMarkdown({ reporter, siteUrl }: { reporter: Reporter; siteUrl: string }) {
  const baseUrl = withoutTrailingSlash(siteUrl);

  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    emDelimiter: '*',
  });

  // Remove the anchor tags from the headers
  turndownService.addRule('header', {
    filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
    replacement: (_, node) => {
      const level = parseInt(node.nodeName.charAt(1), 10);
      // An ATX heading must stay on one line, so collapse internal whitespace
      const text = (node.textContent ?? '').replace(/\s+/g, ' ').trim();
      if (!text) {
        return '';
      }
      // Blank lines on both sides keep the heading a separate block even when
      // the source HTML has no whitespace between sibling elements
      return `\n\n${'#'.repeat(level)} ${text}\n\n`;
    },
  });

  // Update local links to use the siteUrl
  turndownService.addRule('localLink', {
    filter: (node) => {
      if (node.nodeName !== 'A') {
        return false;
      }
      const href = node.getAttribute('href');
      // `//host/path` is protocol-relative (another origin), not a site-local path
      return !!href && href.startsWith('/') && !href.startsWith('//');
    },
    replacement: (content, node) => {
      // most of this replacement is taken from the turndown library directly
      const rawHref = (node as HTMLElement).getAttribute('href') ?? '';
      const href = (baseUrl + rawHref).replace(/([()])/g, '\\$1');
      let title = cleanAttribute((node as HTMLElement).getAttribute('title'));
      if (title) {
        title = ' "' + title.replace(/"/g, '\\"') + '"';
      }
      return '[' + content + '](' + href + title + ')';
    },
  });

  // Find all HTML files
  const htmlFiles = await glob('**/*.html', {
    cwd: CONFIG.htmlDir,
    ignore: CONFIG.excludePatterns,
  });

  reporter.info(`Found ${htmlFiles.length} HTML files to process`);

  let written = 0;

  for (const htmlFile of htmlFiles) {
    let dom: JSDOM | undefined;

    try {
      const fullPath = path.join(CONFIG.htmlDir, htmlFile);
      const htmlContent = await fs.readFile(fullPath, 'utf-8');

      // Parse and clean HTML
      const virtualConsole = new VirtualConsole(); // Stop CSS parsing errors from polluting the console
      dom = new JSDOM(htmlContent, { url: siteUrl, virtualConsole });
      const document = dom.window.document;

      // Remove unwanted elements
      document.querySelectorAll(UNWANTED_ELEMENTS_SELECTOR).forEach((el) => el.remove());

      // Get main content: first matching selector wins, otherwise the whole body
      let mainContent: Element = document.body;
      for (const selector of CONTENT_SELECTORS) {
        const match = document.querySelector(selector);
        if (match) {
          mainContent = match;
          break;
        }
      }

      // Convert to markdown
      const markdown = turndownService.turndown(mainContent.innerHTML);

      // Use forward slashes so the suffix handling below is platform independent
      const pagePath = htmlFile.split(path.sep).join('/');

      // Prepare final content
      let finalContent = markdown;

      if (CONFIG.includeMetadata) {
        const title = document.querySelector('title')?.textContent?.trim() || 'Untitled';
        const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || '';
        const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || '';

        // Strip only a trailing ".html" and a trailing "index" path segment, so
        // directories such as "indexing/" or names containing ".html" survive
        const fallbackUrl = `/${pagePath.replace(/\.html$/, '').replace(/(^|\/)index$/, '')}`;

        finalContent = [
          '---',
          // JSON string syntax is a valid YAML double-quoted scalar and escapes
          // quotes, backslashes and newlines that would otherwise break the header
          `title: ${JSON.stringify(title)}`,
          `url: ${canonicalUrl || fallbackUrl}`,
          `generated_at: ${new Date().toISOString()}`,
          `description: ${JSON.stringify(description)}`,
          '---',
          '',
          markdown,
        ].join('\n');
      }

      // Append .md to the filename, remove /index.html
      const outputName = `${pagePath.replace(/\/index\.html$/, '')}.md`;
      const outputPath = path.join(CONFIG.markdownDir, outputName);

      // Write markdown file
      await fs.writeFile(outputPath, finalContent);
      written += 1;
    } catch (error) {
      reporter.error(
        `✗ Error processing ${htmlFile}:`,
        error instanceof Error ? error : new Error(String(error)),
      );
    } finally {
      // Release the jsdom window so parsed pages do not accumulate in memory
      dom?.window.close();
    }
  }

  const failed = htmlFiles.length - written;
  if (failed > 0) {
    reporter.warn(`${failed} HTML files could not be converted to markdown.`);
  }

  reporter.info(`Markdown export complete! ${written} files processed.`);
}
133+
134+
/** Shape of the GraphQL response requested by the `onPostBuild` site query. */
interface QueryResult {
  site: {
    siteMetadata: {
      // Site URL read from the site metadata; prefixed to root-relative links
      siteUrl: string;
    };
  };
}
141+
142+
/**
 * Gatsby `onPostBuild` hook that runs the markdown export.
 *
 * Queries `site.siteMetadata.siteUrl` through GraphQL and passes it, together
 * with the reporter, to `exportToMarkdown`.
 *
 * Every failure path first calls `reporter.panicOnBuild` and then throws:
 * - GraphQL errors are re-thrown as returned by `graphql`;
 * - a missing `data` payload throws `Error('No documents found.')`;
 * - a missing or empty site URL throws `Error('Site URL not found.')`.
 */
export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => {
  const query = `
    query {
      site {
        siteMetadata {
          siteUrl
        }
      }
    }
  `;
  const { data, errors } = await graphql<QueryResult>(query);

  if (errors) {
    reporter.panicOnBuild(`Error while running GraphQL query.`);
    throw errors;
  }

  if (!data) {
    reporter.panicOnBuild(`No documents found.`);
    throw new Error('No documents found.');
  }

  // Optional chaining: a missing `site` or `siteMetadata` node should reach the
  // "Site URL not found." path below rather than fail with a TypeError here
  const siteUrl = data.site?.siteMetadata?.siteUrl;

  if (!siteUrl) {
    reporter.panicOnBuild(`Site URL not found.`);
    throw new Error('Site URL not found.');
  }

  await exportToMarkdown({ reporter, siteUrl });
};

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@
9090
"react-select": "^5.7.0",
9191
"remark-gfm": "^1.0.0",
9292
"textile-js": "^2.1.1",
93-
"turndown": "^7.1.1",
93+
"turndown": "^7.2.0",
9494
"typescript": "^4.6.3",
9595
"use-keyboard-shortcut": "^1.1.6",
9696
"util": "^0.12.4",
@@ -131,10 +131,12 @@
131131
"eslint-plugin-react-hooks": "^4.6.0",
132132
"fast-check": "^3.4.0",
133133
"gatsby-plugin-postcss": "^6.3.0",
134+
"glob": "^11.0.2",
134135
"identity-obj-proxy": "^3.0.0",
135136
"jest": "^29.3.1",
136137
"jest-axe": "^7.0.0",
137138
"jest-environment-jsdom": "^29.3.1",
139+
"jsdom": "^26.1.0",
138140
"lint-staged": "^13.1.0",
139141
"msw": "^2.0.1",
140142
"postcss": "^8.4.31",

0 commit comments

Comments
 (0)