314 lines
8.5 KiB
TypeScript
314 lines
8.5 KiB
TypeScript
"use server";
|
|
|
|
import * as cheerio from "cheerio";
|
|
|
|
/**
 * One question/answer pair collected from a JSON-LD `FAQPage` entry.
 */
interface FaqItem {
  /** The `name` of the schema.org `Question` node. */
  question: string;
  /** The `text` of the question's `acceptedAnswer`. */
  answer: string;
}
|
|
|
|
/**
 * A heading (`h1`–`h6`) found on the page, arranged as a tree in which deeper
 * heading levels are nested under the closest preceding shallower heading.
 */
export interface HeadlineNode {
  /** Lower-cased tag name, e.g. `"h2"`. */
  tag: string;
  /** Trimmed text content of the heading. */
  text: string;
  /** Character count of `text`. */
  length: number;
  /** Numeric heading level taken from the tag name (1–6). */
  level: number;
  /** Headings nested beneath this one. */
  children: HeadlineNode[];
}
|
|
|
|
/**
 * An `<img>` element found on the page together with its alt text and, when
 * it could be determined, its transfer size.
 */
export interface ImageAltData {
  /** Image URL resolved to an absolute URL against the page URL. */
  src: string;
  /** Trimmed `alt` attribute; empty string when the attribute is missing. */
  alt: string;
  /** Value of the `content-length` response header, or `null` when unavailable. */
  size: number | null;
}
|
|
|
|
/**
 * An `<a href>` element found on the page.
 */
export interface LinkData {
  /** Link target, resolved to an absolute URL when it can be parsed. */
  href: string;
  /** Trimmed text content of the anchor element. */
  text: string;
  /** Classification of the link target relative to the analysed page. */
  type: "internal" | "external" | "anchor" | "other";
  /** Raw `rel` attribute; empty string when the attribute is missing. */
  rel: string;
}
|
|
|
|
/**
 * A CMS, site builder or framework recognised from markers in the page HTML.
 */
export interface DetectedSystem {
  /** Display name of the detected system, e.g. `"WordPress"`. */
  name: string;
}
|
|
|
|
/**
 * Escapes every regular-expression metacharacter in `value` so that it can be
 * embedded in a `RegExp` and matched literally.
 */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Returns true when a JSON-LD node's `@type` equals `expected`, whether the
 * property is a single string or an array of strings.
 */
function hasSchemaType(node: Record<string, unknown>, expected: string): boolean {
  const type = node["@type"];
  return Array.isArray(type) ? type.includes(expected) : type === expected;
}

/**
 * Resolves `href` against `baseUrl` and classifies the result:
 * fragment-only links are `"anchor"`, non-HTTP(S) schemes (mailto:, tel:, …)
 * and unparseable values are `"other"`, and HTTP(S) links are `"internal"`
 * when their hostname equals `pageHostname`, `"external"` otherwise.
 */
function classifyLink(
  href: string,
  baseUrl: string,
  pageHostname: string,
): Pick<LinkData, "href" | "type"> {
  const trimmed = href.trim();

  let resolved: URL;
  try {
    resolved = new URL(trimmed, baseUrl);
  } catch {
    // No host can be determined, so the raw value is reported as-is.
    return { href, type: "other" };
  }

  // These checks must run on the parsed result: with a base URL supplied,
  // `new URL` does not throw for "#…", "mailto:" or "tel:" values.
  if (trimmed.startsWith("#")) {
    return { href: resolved.href, type: "anchor" };
  }
  if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
    return { href: resolved.href, type: "other" };
  }
  return {
    href: resolved.href,
    type: resolved.hostname === pageHostname ? "internal" : "external",
  };
}

/**
 * Fetches a page and extracts SEO-relevant information from its HTML.
 *
 * Collected data: title, description, canonical and robots values, Open Graph
 * and Twitter card tags, JSON-LD nodes (with FAQ pairs pulled out), the
 * heading tree, an optional keyword occurrence count, images with alt text
 * and size, links with a classification, and detected CMS/framework names.
 *
 * NOTE(review): the URL is caller-supplied and is requested from wherever
 * this function runs; if callers are untrusted, consider restricting which
 * hosts may be fetched.
 *
 * @param url Page address; `https://` is prepended when no http(s) scheme is present.
 * @param keyword Optional phrase counted case-insensitively, as literal text,
 *   in the page body (script and style contents excluded).
 * @returns `{ data }` on success or `{ error }` with a user-facing message.
 *   Failures are reported through the return value rather than thrown.
 */
export async function extractMetaData(url: string, keyword?: string) {
  const trimmedUrl = url?.trim();
  if (!trimmedUrl) {
    return { error: "URL is required." };
  }

  const formattedUrl = /^https?:\/\//i.test(trimmedUrl)
    ? trimmedUrl
    : `https://${trimmedUrl}`;

  // Validate before fetching so the "not valid" message does not depend on
  // the wording of whatever error the fetch implementation throws.
  let pageUrl: URL;
  try {
    pageUrl = new URL(formattedUrl);
  } catch {
    return {
      error: "The provided URL is not valid. Please check and try again.",
    };
  }

  try {
    const response = await fetch(formattedUrl, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
      },
    });

    if (!response.ok) {
      return { error: `Failed to fetch URL. Status: ${response.status}` };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    /** `content` attribute of the first element matching `selector`, or null. */
    const metaContent = (selector: string): string | null =>
      $(selector).attr("content") || null;

    const title =
      metaContent('meta[property="og:title"]') ||
      $("title").text() ||
      "No title found";
    const description =
      metaContent('meta[property="og:description"]') ||
      metaContent('meta[name="description"]') ||
      "No description found";
    const image = metaContent('meta[property="og:image"]');
    const canonical = $('link[rel="canonical"]').attr("href") || null;
    const robots = metaContent('meta[name="robots"]');

    // Social tags: each level falls back to the more general value above it.
    const openGraph = {
      title: metaContent('meta[property="og:title"]') || title,
      description: metaContent('meta[property="og:description"]') || description,
      image: metaContent('meta[property="og:image"]') || image,
      url: metaContent('meta[property="og:url"]'),
      siteName: metaContent('meta[property="og:site_name"]'),
      type: metaContent('meta[property="og:type"]'),
    };

    const twitter = {
      card: metaContent('meta[name="twitter:card"]'),
      title: metaContent('meta[name="twitter:title"]') || openGraph.title,
      description:
        metaContent('meta[name="twitter:description"]') || openGraph.description,
      image: metaContent('meta[name="twitter:image"]') || openGraph.image,
      site: metaContent('meta[name="twitter:site"]'),
      creator: metaContent('meta[name="twitter:creator"]'),
    };

    // Structured data (JSON-LD). Kept as `any[]` so the returned shape stays
    // what existing callers compile against.
    const faqData: FaqItem[] = [];
    const schemaData: any[] = [];
    $('script[type="application/ld+json"]').each((_i, el) => {
      const jsonContent = $(el).html();
      if (!jsonContent) return;

      try {
        const parsed = JSON.parse(jsonContent);
        const graph = parsed?.["@graph"];
        // A document may be a bare array, an object wrapping `@graph`, or a
        // single node.
        const candidates: unknown[] = Array.isArray(parsed)
          ? parsed
          : Array.isArray(graph)
            ? graph
            : [parsed];

        for (const candidate of candidates) {
          if (candidate === null || typeof candidate !== "object") continue;
          const item = candidate as Record<string, any>;
          schemaData.push(item);

          if (!hasSchemaType(item, "FAQPage") || !Array.isArray(item.mainEntity)) {
            continue;
          }
          for (const qa of item.mainEntity) {
            if (
              qa &&
              typeof qa === "object" &&
              hasSchemaType(qa, "Question") &&
              qa.name &&
              qa.acceptedAnswer?.text
            ) {
              faqData.push({ question: qa.name, answer: qa.acceptedAnswer.text });
            }
          }
        }
      } catch {
        // Malformed JSON-LD is skipped; the rest of the page is still analysed.
      }
    });

    // Heading tree: `path` holds the chain of currently open ancestors.
    const headlines: HeadlineNode[] = [];
    const path: HeadlineNode[] = [];
    $("h1, h2, h3, h4, h5, h6").each((_i, el) => {
      const tag = $(el).prop("tagName").toLowerCase();
      const text = $(el).text().trim();
      if (!text) return;

      const node: HeadlineNode = {
        tag,
        text,
        length: text.length,
        level: parseInt(tag.replace("h", ""), 10),
        children: [],
      };

      // Close every open heading that is not shallower than this one.
      while (path.length > 0 && path[path.length - 1].level >= node.level) {
        path.pop();
      }

      if (path.length === 0) {
        headlines.push(node);
      } else {
        path[path.length - 1].children.push(node);
      }
      path.push(node);
    });

    // Keyword occurrences. Script/style nodes are stripped from a clone of
    // <body> so the shared document stays intact for the system detection
    // below, and the keyword is escaped so it is matched literally.
    let keywordCount: number | null = null;
    const trimmedKeyword = keyword?.trim();
    if (trimmedKeyword) {
      const body = $("body").clone();
      body.find("script, style").remove();
      const matches = body
        .text()
        .match(new RegExp(escapeRegExp(trimmedKeyword), "gi"));
      keywordCount = matches ? matches.length : 0;
    }

    // Images: resolve each src, then ask the server for its size via HEAD.
    const imageSrcs: { src: string; alt: string }[] = [];
    $("img").each((_i, el) => {
      const src = $(el).attr("src");
      if (!src) return;
      const alt = $(el).attr("alt") || "";
      try {
        imageSrcs.push({
          src: new URL(src, formattedUrl).href,
          alt: alt.trim(),
        });
      } catch {
        // Unresolvable image URLs are left out of the report.
      }
    });

    const imageAltData: ImageAltData[] = await Promise.all(
      imageSrcs.map(async (img) => {
        try {
          const res = await fetch(img.src, { method: "HEAD" });
          if (!res.ok) return { ...img, size: null };
          const contentLength = res.headers.get("content-length");
          const size = contentLength ? parseInt(contentLength, 10) : NaN;
          // A missing or non-numeric header is reported as "unknown".
          return { ...img, size: Number.isNaN(size) ? null : size };
        } catch {
          return { ...img, size: null };
        }
      }),
    );

    // Links.
    const links: LinkData[] = [];
    $("a").each((_i, el) => {
      const href = $(el).attr("href");
      if (!href) return;
      const classified = classifyLink(href, formattedUrl, pageUrl.hostname);
      links.push({
        href: classified.href,
        text: $(el).text().trim(),
        type: classified.type,
        rel: $(el).attr("rel") || "",
      });
    });

    // CMS / framework detection from generator tags and well-known markers.
    const htmlContent = $.html();
    const uniqueSystems = new Set<string>();

    // WordPress
    if (
      $('meta[name="generator"][content*="WordPress"]').length > 0 ||
      htmlContent.includes("/wp-content/") ||
      htmlContent.includes("/wp-includes/")
    ) {
      uniqueSystems.add("WordPress");
    }

    // Shopify
    if (
      htmlContent.includes("cdn.shopify.com") ||
      htmlContent.includes("Shopify.theme")
    ) {
      uniqueSystems.add("Shopify");
    }

    // Next.js
    if ($("#__next").length > 0) {
      uniqueSystems.add("Next.js");
      uniqueSystems.add("React"); // Next.js uses React
    }

    // React (generic)
    if ($("#root").length > 0) {
      uniqueSystems.add("React");
    }

    // Webflow
    if (
      $('meta[name="generator"][content="Webflow"]').length > 0 ||
      htmlContent.includes("<!-- This site was created in Webflow.")
    ) {
      uniqueSystems.add("Webflow");
    }

    // Wix
    if ($('meta[name="generator"][content*="Wix.com"]').length > 0) {
      uniqueSystems.add("Wix");
    }

    // Squarespace
    if (htmlContent.includes("static1.squarespace.com")) {
      uniqueSystems.add("Squarespace");
    }

    const detectedSystems: DetectedSystem[] = Array.from(
      uniqueSystems,
      (name) => ({ name }),
    );

    return {
      data: {
        title,
        description,
        image,
        canonical,
        robots,
        openGraph,
        twitter,
        faq: faqData.length > 0 ? faqData : null,
        schema: schemaData.length > 0 ? schemaData : null,
        headlines: headlines.length > 0 ? headlines : null,
        keyword: trimmedKeyword || null,
        keywordCount,
        images: imageAltData.length > 0 ? imageAltData : null,
        links: links.length > 0 ? links : null,
        systems: detectedSystems.length > 0 ? detectedSystems : null,
      },
    };
  } catch (error) {
    console.error(error);
    // Defensive fallback; the URL itself was already validated above.
    if (error instanceof Error && error.message.includes("Invalid URL")) {
      return {
        error: "The provided URL is not valid. Please check and try again.",
      };
    }
    return { error: "An unexpected error occurred while fetching the URL." };
  }
}