Commit a0a7c5a2 by Lin Wang

feat: handle the page content data

parent 5e9d50ba
No preview for this file type
......@@ -10,5 +10,11 @@
},
"dependencies": {
"jsdom": "^26.1.0"
},
"scripts": {
"extract": "DETECTION_MODE=extract bun run src/detect_section_selector_masters/index.ts",
"detect": "DETECTION_MODE=detect bun run src/detect_section_selector_masters/index.ts",
"fix": "DETECTION_MODE=fix bun run src/detect_section_selector_masters/index.ts",
"nav": "DETECTION_MODE=extractNav bun run src/detect_section_selector_masters/index.ts"
}
}
\ No newline at end of file
import { JSDOM } from "jsdom";
/**
 * Flattens a navigation tree into a list of descriptors, depth first.
 *
 * Each descriptor carries the dotted object path of the item (`patch`),
 * its `type`, and how many direct children it has (`itemsCount`).
 * Only items of type "dropdown" are descended into.
 *
 * @param items   Navigation items at the current level (may be nullish).
 * @param pathArr Path segments leading to the current level.
 * @returns A new array of descriptors; empty when `items` is nullish.
 */
export const getNavigationInfo = (items, pathArr: string[] = []) => {
  return (items ?? []).flatMap((item, idx) => {
    // Object path of this item, e.g. "items[1].items[0]".
    const segments = pathArr.concat(`items[${idx}]`);
    const children = item.items;
    const descriptor = {
      patch: segments.join("."),
      type: item.type,
      itemsCount: children ? children.length : 0,
    };
    // Recurse into dropdowns only; their descendants follow the parent entry.
    const nested =
      item.type === "dropdown" && children
        ? getNavigationInfo(children, segments)
        : [];
    return [descriptor, ...nested];
  });
}
/**
 * Reports whether any double-quoted `style="..."` attribute in an HTML
 * string mentions "background" or "color".
 *
 * @param htmlString HTML fragment to scan; non-strings and "" yield false.
 * @returns true as soon as one matching style attribute is found.
 */
export const hasBackgroundOrColor = (htmlString) => {
  if (typeof htmlString !== "string" || htmlString.length === 0) {
    return false;
  }
  // Capture group 1 is the raw content of each style attribute.
  for (const [, declarations] of htmlString.matchAll(/style="([^"]*)"/g)) {
    const mentionsBackground = declarations.includes("background");
    if (mentionsBackground || declarations.includes("color")) {
      return true;
    }
  }
  return false;
}
/**
 * Removes every `color` declaration from the double-quoted `style="..."`
 * attributes of an HTML string.
 *
 * Only the `color` property itself is removed: declarations such as
 * `background-color` or `border-color` are left intact, and text outside
 * of style attributes is never touched. Matching is case-insensitive and
 * tolerates whitespace around the colon.
 *
 * @param htmlString HTML fragment to clean.
 * @returns The HTML with `color: ...;` declarations stripped from styles.
 */
export const removeColorInStyle = (htmlString: string) => {
  return htmlString.replace(/style="([^"]*)"/g, (_attr, css: string) => {
    // The lookbehind anchors the match to the start of a declaration (start
    // of the attribute or right after ';'), so "background-color" is not
    // mistaken for "color".
    const cleaned = css.replace(/(?<=^|;)\s*color\s*:[^;]*;?/gi, "");
    return `style="${cleaned}"`;
  });
}
/**
 * Recursively strips inline colors from every RichText node in a tree.
 *
 * A node counts as RichText when `type === "RichText"` and `value` is a
 * string; its `value` is rewritten in place via `removeColorInStyle`.
 * Children are looked up in `components` (object values), `items` (array)
 * and `list` (array).
 *
 * @param node Root of the subtree to clean; falsy nodes are ignored.
 */
export const cleanRichText = (node: any) => {
  if (!node) return;

  const isRichText = node.type === "RichText" && typeof node.value === "string";
  if (isRichText) {
    node.value = removeColorInStyle(node.value);
  }

  if (node.components) {
    for (const child of Object.values(node.components)) {
      cleanRichText(child);
    }
  }
  for (const key of ["items", "list"]) {
    const children = node[key];
    if (!Array.isArray(children)) continue;
    for (const child of children) {
      cleanRichText(child);
    }
  }
}
/**
 * Recursively checks whether any RichText node in a tree carries an inline
 * color, as detected by `extractInlineColors`.
 *
 * The traversal visits the same children as `cleanRichText`: the values of
 * `components` plus the `items` and `list` arrays. All three collections
 * are inspected; finding nothing in one of them does not stop the search
 * in the others.
 *
 * @param node Root of the subtree to inspect; falsy nodes yield false.
 * @returns true if at least one RichText value contains an inline color.
 */
export const hasInlineColor = (node: any): boolean => {
  if (!node) return false;

  const isRichText = node.type === "RichText" && typeof node.value === "string";
  if (isRichText && extractInlineColors(node.value).length > 0) {
    return true;
  }

  // Gather every child collection before searching, so a color-free
  // `components` map cannot hide a colored entry in `items` or `list`.
  const children = [
    ...(node.components ? Object.values(node.components) : []),
    ...(Array.isArray(node.items) ? node.items : []),
    ...(Array.isArray(node.list) ? node.list : []),
  ];
  return children.some((child) => hasInlineColor(child));
}
/**
 * Collects the value of every `color: <value>` occurrence in an HTML string.
 *
 * Matching is case-insensitive and purely textual: a value runs up to the
 * next `;` or `"`, and the pattern also matches the `color:` suffix of
 * properties such as `background-color`.
 *
 * @param html HTML fragment to scan.
 * @returns The trimmed color values, in order of appearance.
 */
export const extractInlineColors = (html: string): string[] => {
  const occurrences = html.matchAll(/color\s*:\s*([^;"]+)/gi);
  return Array.from(occurrences, (hit) => hit[1].trim());
}
/**
 * Gathers the inline colors of all RichText entries found directly under
 * `item.components` (no recursion into nested components).
 *
 * @param item Object whose `components` map is inspected.
 * @returns All colors reported by `extractInlineColors`, in component
 *          order; empty when `item` or `item.components` is missing.
 */
export const getItemRichTextColors = (item: any): string[] => {
  const components = item?.components;
  if (!components) return [];

  const colors: string[] = [];
  for (const comp of Object.values(components) as any[]) {
    const isRichText = comp.type === "RichText" && typeof comp.value === "string";
    if (isRichText) {
      colors.push(...extractInlineColors(comp.value));
    }
  }
  return colors;
}
/**
 * Extracts the numeric suffix from a `var(--s-pre-colorN)` CSS variable
 * reference, e.g. `var(--s-pre-color10)` yields 10.
 *
 * @param color Color value to inspect (matched case-insensitively).
 * @returns The number N, or null when the value is nullish or does not
 *          contain such a variable reference.
 */
export const extractCssVarIndex = (color: string): number | null => {
  if (color == null) return null;
  const found = color.match(/var\(--s-pre-color(\d+)\)/i);
  if (!found) return null;
  return Number.parseInt(found[1], 10);
}
/**
 * Ensures that every element directly containing non-blank text has an
 * explicit inline text color.
 *
 * For the parent element of each non-blank text node:
 *  - an inline `color: #50555c` (case-insensitive) is rewritten to
 *    `color: #222222`;
 *  - otherwise, if the element already declares a `color`, it is left alone;
 *  - otherwise `color: #222222;` is appended to its style attribute.
 *
 * Only the `color` property is considered; properties that merely end in
 * "color" (e.g. `background-color`) neither count as a text color nor get
 * rewritten.
 *
 * @param html HTML fragment; it is parsed with JSDOM inside a `<body>`.
 * @returns The serialized inner HTML of the processed body.
 */
export const addColorToHtmlString = (html) => {
  // JSDOM provides the DOM here; in a browser, DOMParser could be used instead.
  const dom = new JSDOM(`<body>${html}</body>`);
  const { document, NodeFilter } = dom.window;
  const walker = document.createTreeWalker(
    document.body,
    NodeFilter.SHOW_TEXT
  );

  // Elements already processed, so siblings sharing a parent handle it once.
  const touched = new Set();
  // `(?<![\w-])` keeps "background-color" etc. from matching as "color".
  const legacyColorRegex = /(?<![\w-])color\s*:\s*#50555c/gi;
  const anyColorRegex = /(?<![\w-])color\s*:/i;

  while (walker.nextNode()) {
    const textNode = walker.currentNode;
    if (!textNode?.nodeValue?.trim()) continue;
    const parent = textNode.parentElement;
    if (!parent || touched.has(parent)) continue;
    touched.add(parent);

    const oriStyle = parent.getAttribute("style") || "";

    // Replace the legacy color #50555c with #222222. Comparing the result
    // avoids calling .test() on a global regex, which is stateful.
    const overridden = oriStyle.replace(legacyColorRegex, "color: #222222");
    if (overridden !== oriStyle) {
      parent.setAttribute("style", overridden);
      continue;
    }

    // An explicit text color is already present: keep it.
    if (anyColorRegex.test(oriStyle)) continue;

    // Append the default color, inserting ';' when the existing style does
    // not end with one (otherwise the declarations would run together).
    const base = oriStyle.trim();
    const separator = base === "" || base.endsWith(";") ? "" : ";";
    parent.setAttribute("style", `${base}${separator}color: #222222;`);
  }
  return document.body.innerHTML;
}
/**
 * Derives the background-color value path that belongs to a component path
 * of the form `components.repeatable1.list[N].components.<name>`.
 *
 * @param patchPath Dotted path of a component inside a repeatable list item.
 * @returns `components.repeatable1.list[N].components.background1.backgroundColor.value`,
 *          or null when `patchPath` does not have the expected form.
 */
export const getBgValuePath = (patchPath: string): string | null => {
  // Group 1 is the path up to and including the item's ".components".
  const parsed = patchPath.match(
    /^(components\.repeatable1\.list\[\d+\]\.components)\.[^.]+$/
  );
  return parsed ? parsed[1] + ".background1.backgroundColor.value" : null;
}
/**
 * Reads a nested value using a dotted path whose segments may carry one
 * trailing array index, e.g. `a.list[2].b`.
 *
 * Missing intermediate values do not throw; the lookup simply yields
 * undefined.
 *
 * @param obj  Root object to read from.
 * @param path Dotted path such as `components.list[0].value`.
 * @returns The value at `path`, or undefined when it cannot be reached.
 */
export const getByPath = (obj: any, path: string): any => {
  let cursor = obj;
  for (const segment of path.split(".")) {
    // "name[3]" -> property "name", then element 3.
    const indexed = segment.match(/^(.+)\[(\d+)\]$/);
    cursor = indexed
      ? cursor?.[indexed[1]]?.[Number(indexed[2])]
      : cursor?.[segment];
  }
  return cursor;
}
\ No newline at end of file
import { writeFileSync, mkdirSync, existsSync } from 'fs'
import { fetchSiteData } from '../clients/bobcat/SiteInfo'
import {
section_selectors,
ai_section_selectors
} from '../constant/section_selectors'
import { mainParse } from './handlePageContent'
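/*
Supported modes (selected through the DETECTION_MODE environment variable):
detect: apply the detection constraints and output only the problematic fields
extract: output the selected fields of the whole page content
fix: repair the page content data and output the complete, fixed page content
extractNav: output the navigation info, e.g. to see how many items each dropdown has
*/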
// demo
type CommandMode = 'detect' | 'extract' | 'fix' | 'extractNav';
// await Promise.all(
// section_selectors.map(async (siteId, index) => {
// const jsonData = await fetchSiteData(siteId);
// // jsonData.content is page data under this site
// return jsonData.content;
// })
// )
\ No newline at end of file
// Output directory used by each supported mode.
const OUTPUT_DIR_BY_MODE: Record<CommandMode, string> = {
  detect: 'src/detect_section_selector_masters/detectOutput',
  extract: 'src/detect_section_selector_masters/extractOutput',
  fix: 'src/detect_section_selector_masters/fixedOutput',
  extractNav: 'src/detect_section_selector_masters/extractNavOutput'
}

// DETECTION_MODE selects the mode; it defaults to 'extract' when unset or empty.
const requestedMode = process.env.DETECTION_MODE || 'extract'
// Fail fast on an unknown mode instead of continuing with an empty output directory.
if (!Object.keys(OUTPUT_DIR_BY_MODE).includes(requestedMode)) {
  throw new Error(
    `Unsupported DETECTION_MODE "${requestedMode}"; expected one of: ${Object.keys(OUTPUT_DIR_BY_MODE).join(', ')}`
  )
}
const commandMode = requestedMode as CommandMode

// Make sure the output directory exists.
const outDir = OUTPUT_DIR_BY_MODE[commandMode]
if (!existsSync(outDir)) mkdirSync(outDir, { recursive: true })

// Fetch and process every site concurrently; each site gets its own JSON file.
await Promise.all(
  section_selectors.map(async (siteId) => {
    const { content } = await fetchSiteData(siteId)
    const { jsonData: pageContent, result, navigationInfo } = mainParse(content) || {}
    const file = `${outDir}/${siteId}_${commandMode}.json`

    // Which part of the parse result each mode writes out.
    const dataByMode: Record<CommandMode, unknown> = {
      detect: result,
      extract: result,
      fix: pageContent,
      extractNav: navigationInfo
    }
    // JSON.stringify(undefined) returns undefined, which writeFileSync rejects;
    // fall back to an empty array when the parser produced nothing for this mode.
    const data = dataByMode[commandMode] ?? []

    writeFileSync(file, JSON.stringify(data, null, 2), 'utf-8')
    console.log(`✅ ${siteId} output written to ${file}`)
  })
)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment