8f9c1d64b8
* verify-links: catch broken anchors, directory links, and invisible characters Enhances the link verification script to catch several categories of broken links that were previously missed: - Broken anchors (cross-file and same-file) by extracting heading slugs from target documents and verifying anchors resolve - Directory links missing index.md suffix within docs/ - Invisible/zero-width characters in URLs - Case-sensitive anchor mismatches Also strips fenced code blocks before scanning for links to avoid false positives, and handles duplicate heading slug deduplication (GitHub and Docusaurus append -1, -2, etc.). Fixes a few newly-caught broken links in existing docs. Signed-off-by: Patrik Oldsberg <poldsberg@gmail.com> Made-with: Cursor * Fix CodeQL incomplete multi-character sanitization alert Apply HTML tag stripping in a loop so that nested fragments like <scr<script>ipt> are fully removed. Signed-off-by: Patrik Oldsberg <poldsberg@gmail.com> Made-with: Cursor --------- Signed-off-by: Patrik Oldsberg <poldsberg@gmail.com>
438 lines
14 KiB
JavaScript
Executable File
438 lines
14 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
/*
 * Copyright 2020 The Backstage Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
/* eslint-disable @backstage/no-undeclared-imports */
|
|
|
|
const { resolve: resolvePath, join: joinPath, dirname } = require('node:path');
|
|
const fs = require('node:fs').promises;
|
|
const { existsSync, statSync } = require('node:fs');
|
|
|
|
// Entry names that are skipped entirely while walking the repository tree.
const IGNORED_DIRS = [
  'node_modules',
  'dist',
  'bin',
  '.git',
];

// Repository root: the parent of the directory that contains this script.
const projectRoot = resolvePath(__dirname, '..');

// Matches zero-width and other invisible Unicode characters, none of which
// should ever appear inside a URL.
const INVISIBLE_CHAR_PATTERN =
  /[\u200B\u200C\u200D\u200E\u200F\uFEFF\u00AD\u2060\u2028\u2029]/;
|
|
|
|
/**
 * Generates a GitHub/Docusaurus-compatible anchor slug for a markdown heading.
 *
 * An explicit `{#custom-id}` suffix takes precedence and is returned verbatim.
 * Otherwise the text is lower-cased, inline markdown and HTML markup is
 * stripped, punctuation is removed, and every remaining whitespace character
 * becomes a hyphen.
 *
 * @param {string} headingText - Heading text without the leading `#` markers.
 * @returns {string} The anchor slug, without a leading `#`.
 */
function headingToSlug(headingText) {
  const explicitId = headingText.match(/\{#([^}]+)\}\s*$/);
  if (explicitId) {
    return explicitId[1];
  }

  let slug = headingText
    .toLowerCase()
    // Remove inline code backticks
    .replace(/`/g, '')
    // Remove markdown bold/italic markers
    .replace(/[*_]/g, '')
    // Remove markdown links, keep link text
    .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');

  // Remove HTML tags in a loop to handle nested fragments like <scr<script>ipt>
  let previous;
  do {
    previous = slug;
    slug = slug.replace(/<[^>]+>/g, '');
  } while (slug !== previous);

  return (
    slug
      // Drop everything except letters, marks, digits, connector punctuation,
      // whitespace and hyphens. Unicode classes are used rather than \w so
      // that non-ASCII letters survive, as they do in real heading anchors.
      .replace(/[^\p{L}\p{M}\p{N}\p{Pc}\s-]/gu, '')
      .trim()
      // Each whitespace character maps to its own hyphen, so removed
      // punctuation between two spaces yields a double hyphen
      // ("a & b" -> "a--b").
      .replace(/\s/g, '-')
  );
}
|
|
|
|
/**
 * Collects the anchor slugs of all `#`-style headings in a markdown document.
 *
 * Repeated headings receive `-1`, `-2`, ... suffixes in order of appearance,
 * mirroring GitHub/Docusaurus behavior.
 *
 * @param {string} content - Raw markdown text.
 * @returns {Set<string>} Every anchor that the document's headings produce.
 */
function extractHeadingAnchors(content) {
  // Headings inside fenced code blocks are sample text, not real anchors.
  const withoutFences = content.replace(/^```[^\n]*\n[\s\S]*?^```/gm, '');

  const occurrencesBySlug = new Map();
  const anchors = new Set();

  for (const [, headingText] of withoutFences.matchAll(/^#{1,6}\s+(.+)$/gm)) {
    const slug = headingToSlug(headingText);
    const seenBefore = occurrencesBySlug.get(slug) || 0;
    occurrencesBySlug.set(slug, seenBefore + 1);

    // The first occurrence keeps the plain slug; later ones are numbered.
    anchors.add(seenBefore === 0 ? slug : `${slug}-${seenBefore}`);
  }

  return anchors;
}
|
|
|
|
// Memoizes the extracted anchors per absolute file path so that each target
// document is read and parsed at most once.
const anchorCache = new Map();

/**
 * Returns the heading anchors of a markdown file, reading it on first use.
 *
 * @param {string} filePath - File path, resolved against the project root.
 * @returns {Promise<Set<string>|null>} The anchors, or `null` when the file
 *   could not be read or parsed.
 */
async function getAnchorsForFile(filePath) {
  const absolutePath = resolvePath(projectRoot, filePath);

  if (!anchorCache.has(absolutePath)) {
    let extracted = null;
    try {
      const markdown = await fs.readFile(absolutePath, 'utf8');
      extracted = extractHeadingAnchors(markdown);
    } catch {
      // Failures are cached as null too, so callers skip the anchor check
      // instead of retrying the read for every link.
    }
    anchorCache.set(absolutePath, extracted);
  }

  return anchorCache.get(absolutePath);
}
|
|
|
|
/**
 * Recursively lists every file below a directory, skipping any entry whose
 * name appears in IGNORED_DIRS.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Flat list of file paths, each prefixed by `dir`.
 */
async function listFiles(dir) {
  const entryNames = await fs.readdir(dir);

  // Directories expand into their own listing; anything else is returned as-is.
  const expandEntry = async entryPath => {
    const stats = await fs.stat(entryPath);
    return stats.isDirectory() ? listFiles(entryPath) : entryPath;
  };

  const pending = [];
  for (const name of entryNames) {
    if (!IGNORED_DIRS.includes(name)) {
      pending.push(expandEntry(joinPath(dir, name)));
    }
  }

  const nested = await Promise.all(pending);
  return nested.flat();
}
|
|
|
|
/**
 * Verifies a single link target found in a markdown file.
 *
 * Only local targets are checked for existence; anything that looks like it
 * has a URL scheme is skipped, apart from links to this repository's master
 * branch on GitHub, which are mapped back to local paths.
 *
 * @param {string} basePath - Path of the file containing the link, relative
 *   to the project root.
 * @param {string} absUrl - The link target exactly as written in the markdown.
 * @param {Set<string>} docPages - Doc page paths as served by the microsite.
 * @returns {Promise<object|undefined>} `undefined` when the link is fine,
 *   otherwise `{ url, basePath, problem }` (plus `apiPath` for `api-missing`).
 */
async function verifyUrl(basePath, absUrl, docPages) {
  const githubMasterPrefix =
    /https:\/\/github.com\/backstage\/backstage\/(tree|blob)\/master/;

  // Check for invisible/zero-width characters in the URL
  if (INVISIBLE_CHAR_PATTERN.test(absUrl)) {
    return { url: absUrl, basePath, problem: 'invisible-chars' };
  }

  const anchorMatch = absUrl.match(/#(.+)$/);
  const anchor = anchorMatch ? anchorMatch[1] : undefined;
  const urlWithoutAnchor = absUrl.replace(/#.*$/, '');
  const isGitHubUrl = githubMasterPrefix.test(urlWithoutAnchor);
  // Links into this repo on GitHub are reduced to a root-relative local path.
  const url = urlWithoutAnchor.replace(githubMasterPrefix, '');

  // Avoid having absolute URL links within docs/, so that links work on the site
  if (
    absUrl.match(
      /https:\/\/github.com\/backstage\/backstage\/(tree|blob)\/master\/docs\//,
    ) &&
    basePath.match(/^(?:docs|microsite)\//)
  ) {
    // Exception for linking to the changelogs, since we encourage those to be browsed in GitHub
    if (absUrl.match(/docs\/releases\/.+-changelog\.md$/)) {
      if (docPages.has(url.slice(0, -'.md'.length))) {
        return undefined;
      }
      return { url: absUrl, basePath, problem: 'missing' };
    }

    return { url: absUrl, basePath, problem: 'github' };
  }

  // Same-file anchor reference (e.g. #some-heading). A GitHub link to the
  // repository root also leaves `url` empty, but its anchor does not refer to
  // the current file, so it must not be checked against it.
  if (!url && anchor && !isGitHubUrl) {
    const anchors = await getAnchorsForFile(basePath);
    if (anchors && !anchors.has(anchor)) {
      return { url: absUrl, basePath, problem: 'bad-anchor' };
    }
    return undefined;
  }

  if (!url) {
    return undefined;
  }

  // Only verify existence of local files for now, so skip anything with a schema
  if (url.match(/[a-z]+:/)) {
    return undefined;
  }

  if (basePath.startsWith('.changeset/')) {
    if (absUrl.match(/^https?:\/\//)) {
      return undefined;
    }
    return { url, basePath, problem: 'out-of-changeset' };
  }

  let path = '';

  if (url.startsWith('/')) {
    if (url.startsWith('/docs/')) {
      if (basePath.match(/^(?:docs)\//)) {
        return { url, basePath, problem: 'not-relative' };
      }
      if (basePath.startsWith('microsite/')) {
        if (docPages.has(url)) {
          return undefined;
        }
        return { url, basePath, problem: 'doc-missing' };
      }
    }

    if (url.startsWith('/api/stable/')) {
      const apiPath = resolvePath(
        projectRoot,
        `type-docs/${url.slice('/api/stable/'.length)}`,
      );
      if (existsSync(apiPath)) {
        return undefined;
      }
      return { url, basePath, apiPath, problem: 'api-missing' };
    }

    // Root-relative links may point at microsite static assets...
    const staticPath = resolvePath(projectRoot, 'microsite/static', `.${url}`);
    if (existsSync(staticPath)) {
      return undefined;
    }

    // ...or at a path relative to the repository root.
    path = resolvePath(projectRoot, `.${url}`);
  } else {
    path = resolvePath(dirname(resolvePath(projectRoot, basePath)), url);
  }

  // Relative links from docs/ must stay inside docs/. The comparison uses a
  // trailing separator so that sibling directories sharing the "docs" prefix
  // are not mistaken for docs/ itself, and it applies regardless of whether
  // the link carries an anchor. Rewritten GitHub URLs are exempt.
  const docsRoot = resolvePath(projectRoot, 'docs');
  const isInsideDocs =
    path === docsRoot || path.startsWith(joinPath(docsRoot, '/'));
  if (!isGitHubUrl && basePath.match(/^(?:docs)\//) && !isInsideDocs) {
    return { url, basePath, problem: 'out-of-docs' };
  }

  if (!existsSync(path)) {
    return { url, basePath, problem: 'missing' };
  }

  // Flag relative links to directories that are missing /index.md —
  // these resolve as existing dirs but aren't valid doc links.
  // Only check within docs/ since other directories (like microsite/)
  // may legitimately link to directories in READMEs.
  if (
    basePath.match(/^docs\//) &&
    !url.startsWith('/') &&
    statSync(path).isDirectory()
  ) {
    return { url: absUrl, basePath, problem: 'directory-link' };
  }

  // Verify anchors in cross-file links, but skip rewritten GitHub URLs
  // since their anchors may reference generated content we can't verify locally
  if (anchor && path.endsWith('.md') && !isGitHubUrl) {
    const targetAnchors = await getAnchorsForFile(
      path.startsWith(projectRoot) ? path.slice(projectRoot.length + 1) : path,
    );
    if (targetAnchors && !targetAnchors.has(anchor)) {
      return { url: absUrl, basePath, problem: 'bad-anchor' };
    }
  }

  return undefined;
}
|
|
|
|
/**
 * Removes fenced (triple-backtick) code blocks from markdown so that links
 * inside code samples are not checked.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string} The markdown with every fenced block removed.
 */
function stripCodeBlocks(content) {
  // A fence opens on a line starting with ``` and runs to the next line that
  // starts with ```.
  const fencedBlock = /^```[^\n]*\n[\s\S]*?^```/gm;
  const withoutFences = content.replace(fencedBlock, '');
  return withoutFences;
}
|
|
|
|
/**
 * Checks every markdown link in one file.
 *
 * @param {string} filePath - Markdown file to scan; also used as the base
 *   path when resolving its links.
 * @param {Set<string>} docPages - Doc page paths, forwarded to verifyUrl.
 * @returns {Promise<object[]>} One problem record per bad link, including a
 *   `multi-line` record for each link whose text spans several lines.
 */
async function verifyFile(filePath, docPages) {
  const markdown = stripCodeBlocks(await fs.readFile(filePath, 'utf8'));
  const problems = [];

  // Single-line links: verify each target one at a time.
  const inlineLinks = markdown.match(/\[.+?\]\(.+?\)/g) || [];
  for (const link of inlineLinks) {
    const [, rawTarget] = link.match(/\[.+\]\((.+)\)/);
    const problem = await verifyUrl(filePath, rawTarget.trim(), docPages);
    if (problem) {
      problems.push(problem);
    }
  }

  // Links whose text wraps onto another line are reported, not resolved.
  const wrappedLinks =
    markdown.match(/\[[^\]\n]+?\n[^\]\n]*?(?:\n[^\]\n]*?)?\]\(/g) || [];
  for (const fragment of wrappedLinks) {
    problems.push({
      url: fragment,
      basePath: filePath,
      problem: 'multi-line',
    });
  }

  return problems;
}
|
|
|
|
/**
 * Discovers the doc paths as they will be available on the microsite.
 *
 * It is used to validate microsite links from outside /docs/, as those
 * are not transformed from the markdown file representation by docusaurus.
 *
 * @param {string} dir - Directory containing the markdown docs.
 * @returns {Promise<Map<string, string>>} Maps each `/<file path>` (and, for
 *   `index.md` files, the containing directory path) to its site path.
 */
async function findExternalDocsLinks(dir) {
  const allFiles = await listFiles(dir);
  const mdFiles = allFiles.filter(p => p.endsWith('.md'));

  const paths = new Map();

  for (const file of mdFiles) {
    const content = await fs.readFile(file, 'utf8');
    const url = `/${file}`;
    const id = readFrontMatterId(content);

    // Both docs with an id and without should remove trailing /index
    const realPath = (
      id !== undefined ? joinPath(dirname(url), id) : url.replace(/\.md$/, '')
    ).replace(/\/index$/, '');

    paths.set(url, realPath);
    if (url.endsWith('/index.md')) {
      paths.set(url.replace(/\/index\.md$/, ''), realPath);
    }
  }

  return paths;
}

// Returns the `id` declared in the front matter block at the very top of a
// markdown document, or undefined when there is none. The search is confined
// to the front matter so that an `id: ` line in the body or in a code sample
// is never mistaken for the document id.
function readFrontMatterId(content) {
  const frontMatter = content.match(
    /^\uFEFF?---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/,
  );
  if (!frontMatter) {
    return undefined;
  }
  const idLine = frontMatter[1].match(/^id: (.*)$/m);
  // Trim so that a trailing \r from CRLF line endings is not part of the id.
  return idLine ? idLine[1].trim() : undefined;
}
|
|
|
|
/**
 * Entry point: scans every markdown file in the repository, verifies its
 * links and prints one report per problem.
 *
 * Exits the process with code 1 when at least one reportable problem was
 * found. Problems that cannot be verified in the current checkout (see
 * isUnverifiableProblem) are neither counted nor reported.
 */
async function main() {
  process.chdir(projectRoot);

  const isCI = Boolean(process.env.CI);

  const files = await listFiles('.');
  const mdFiles = files.filter(f => f.endsWith('.md'));
  const badUrls = [];

  const docPages = await findExternalDocsLinks('docs');
  const docPageSet = new Set(docPages.values());

  for (const mdFile of mdFiles) {
    const badFileUrls = await verifyFile(mdFile, docPageSet);
    badUrls.push(...badFileUrls);
  }

  const hasReference = existsSync(resolvePath(projectRoot, 'docs/reference'));
  if (!hasReference) {
    console.log(
      "Skipping API reference link validation, no docs/reference/ dir. Reference docs can be built with 'yarn build:api-docs'",
    );
  }

  const hasApiDocs = existsSync(resolvePath(projectRoot, 'type-docs'));
  if (!hasApiDocs) {
    console.log(
      "Skipping API docs link validation, no type-docs/ dir. API docs can be built with 'yarn backstage-repo-tools package-docs'",
    );
  }

  // Drop skipped problems up front so that they neither inflate the count
  // nor fail the run without any explanation being printed.
  const reportable = badUrls.filter(
    badUrl => !isUnverifiableProblem(badUrl, { isCI, hasReference, hasApiDocs }),
  );
  if (reportable.length === 0) {
    return;
  }

  console.log(`Found ${reportable.length} bad links within repo`);
  for (const badUrl of reportable) {
    reportBadUrl(badUrl, docPages);
  }
  process.exit(1);
}

// Tells whether a problem depends on generated docs that are absent from this
// checkout: links into ../reference/ outside CI without docs/reference, and
// API links without a type-docs directory.
function isUnverifiableProblem(badUrl, { isCI, hasReference, hasApiDocs }) {
  const { url, problem } = badUrl;
  if (problem === 'missing') {
    return url.startsWith('../reference/') && !isCI && !hasReference;
  }
  if (problem === 'api-missing') {
    return !hasApiDocs;
  }
  return false;
}

// Prints the human-readable report for a single bad link to stderr.
// `docPages` maps doc file paths to site paths and is used to suggest a
// replacement for `doc-missing` links.
function reportBadUrl(badUrl, docPages) {
  const { url, basePath, problem } = badUrl;

  switch (problem) {
    case 'missing':
      console.error(
        `Unable to reach ${url} from root or microsite/static/, linked from ${basePath}`,
      );
      break;
    case 'out-of-docs':
      console.error(
        'Links in docs must use absolute URLs for targets outside of docs',
      );
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      console.error(
        ` Likely replace with: https://github.com/backstage/backstage/blob/master/${url.replace(
          /^[./]+/,
          '',
        )}`,
      );
      break;
    case 'out-of-changeset':
      console.error('Links in changesets must use absolute URLs');
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      break;
    case 'doc-missing': {
      const suggestion =
        docPages.get(url) ||
        docPages.get(new URL(url, 'http://localhost').pathname);
      console.error('Links into /docs/ must use an externally reachable ID');
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      if (suggestion) {
        console.error(` Replace with: ${suggestion}`);
      }
      break;
    }
    case 'api-missing':
      console.error('Invalid API docs link');
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      console.error(` Resolved path: ${badUrl.apiPath}`);
      break;
    case 'not-relative':
      console.error('Links within /docs/ must be relative');
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      break;
    case 'github':
      console.error(
        `Link to docs/ should not use a GitHub URL, use a relative URL instead`,
      );
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      break;
    case 'multi-line':
      console.error(`Links are not allowed to span multiple lines:`);
      console.error(` From: ${basePath}`);
      console.error(` To: ${url.replace(/\n/g, '\n ')}`);
      break;
    case 'bad-anchor':
      console.error(`Anchor not found in target document`);
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      break;
    case 'directory-link':
      console.error(
        `Link points to a directory instead of a file, use index.md suffix`,
      );
      console.error(` From: ${basePath}`);
      console.error(` To: ${url}`);
      break;
    case 'invisible-chars':
      console.error(`Link contains invisible or zero-width characters`);
      console.error(` From: ${basePath}`);
      // JSON.stringify makes the otherwise invisible characters visible.
      console.error(` To: ${JSON.stringify(url)}`);
      break;
    default:
      break;
  }
}
|
|
|
|
// Run the check; an unexpected failure is printed with its stack trace and
// turned into a non-zero exit code.
main().catch(err => {
  console.error(err.stack);
  process.exit(1);
});
|