diff --git a/README.md b/README.md index dd52f71..a6b49fb 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ | `source` | string | 否 | 来源标识,建议填智能体名称、工作流名或插件名,最大 80 个字符 | | `request_id` | string | 否 | 请求追踪 ID,方便排查问题,最大 120 个字符 | | `ttl_days` | integer | 否 | 文件保留天数,默认 7 天,最大 30 天 | +| `preserve_markdown_fence` | boolean | 否 | 默认为 `false`。当智能体返回的是 ```html fenced code block 时,服务会自动剥离最外层围栏;如果你就是要保留围栏原文,传 `true` | #### 兼容别名 @@ -111,6 +112,39 @@ 这个地址会直接返回 `text/html` 内容。智能体侧通常只需要使用生成接口返回的 `url` 字段即可。 +## Markdown 代码围栏处理 + +很多模型会把 HTML 包在下面这种 Markdown 代码块里: + +```text +```html + +... +``` +``` + +当前服务默认会自动剥离最外层围栏,然后再保存 HTML。 + +为了避免误判,剥离逻辑是保守的,只有在以下条件同时满足时才会剥离: + +- 整个请求内容几乎就是一个外层 fenced code block +- 代码块语言是 `html`、`htm`、`xhtml`、`xml`,或者虽然没写语言但内容明显像 HTML + +以下情况不会剥离: + +- HTML 正文内部只是包含了几个 ``` 代码围栏示例 +- 代码块语言是 `js`、`python` 等非 HTML +- 内容整体看起来不像 HTML + +如果你确实希望把最外层 ``` 原样保留到页面中,可以在请求体里传: + +```json +{ + "html_content": "```html\n
hello
\n```", + "preserve_markdown_fence": true +} +``` + ## 智能体调用约定 建议腾讯云智能体在工具描述中遵守以下规则: @@ -215,6 +249,7 @@ NEXT_PUBLIC_API_BASE_URL=http://localhost:8000 | `MAX_HTML_LENGTH` | integer | `200000` | 单次 HTML 最大字节数 | | `API_KEY` | string | 空 | 非空时启用 `X-API-Key` 鉴权 | | `ALLOW_UNSAFE_HTML` | boolean | `true` | 是否关闭 HTML 安全拦截;当前默认开启,便于执行 JS | +| `STRIP_MARKDOWN_CODE_FENCE` | boolean | `true` | 是否自动剥离最外层 Markdown 代码围栏 | | `ENABLE_REQUEST_DEBUG_LOG` | boolean | `true` | 是否输出 422 请求调试日志 | | `REQUEST_LOG_MAX_CHARS` | integer | `10000` | 单次请求日志最多记录多少字符 | diff --git a/backend/.env b/backend/.env index 6230986..4e2d161 100644 --- a/backend/.env +++ b/backend/.env @@ -8,5 +8,6 @@ MAX_RETENTION_DAYS=30 MAX_HTML_LENGTH=200000 API_KEY="" ALLOW_UNSAFE_HTML=true +STRIP_MARKDOWN_CODE_FENCE=true ENABLE_REQUEST_DEBUG_LOG=true REQUEST_LOG_MAX_CHARS=10000 diff --git a/backend/.env.example b/backend/.env.example index 637be23..573e70b 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -8,5 +8,6 @@ MAX_RETENTION_DAYS=30 MAX_HTML_LENGTH=200000 API_KEY="" ALLOW_UNSAFE_HTML=true +STRIP_MARKDOWN_CODE_FENCE=true ENABLE_REQUEST_DEBUG_LOG=true REQUEST_LOG_MAX_CHARS=10000 diff --git a/backend/app/config.py b/backend/app/config.py index 0ebc304..27c5eb6 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -76,6 +76,7 @@ class Settings: max_html_length = max(1024, _get_int_env("MAX_HTML_LENGTH", 200_000)) api_key = os.getenv("API_KEY", "").strip() allow_unsafe_html = _get_bool_env("ALLOW_UNSAFE_HTML", False) + strip_markdown_code_fence = _get_bool_env("STRIP_MARKDOWN_CODE_FENCE", True) enable_request_debug_log = _get_bool_env("ENABLE_REQUEST_DEBUG_LOG", True) request_log_max_chars = max(256, _get_int_env("REQUEST_LOG_MAX_CHARS", 10_000)) diff --git a/backend/app/routers/html.py b/backend/app/routers/html.py index 2d9f8bd..227f38c 100644 --- a/backend/app/routers/html.py +++ b/backend/app/routers/html.py @@ -48,6 +48,12 @@ CONTENT_SECURITY_POLICY = "; ".join( ] ) +MARKDOWN_CODE_FENCE_PATTERN = re.compile( + r"^\s*```(?P[a-zA-Z0-9_+-]*)[^\S\r\n]*\r?\n" + r"(?P[\s\S]*?)" + r"\r?\n```\s*$" +) + def build_response_headers() -> dict[str, str]: headers = { @@ -61,6 +67,32 @@ def build_response_headers() -> dict[str, str]: return headers +def strip_outer_markdown_code_fence( + raw_content: str, + preserve_markdown_fence: bool, +) -> str: + if preserve_markdown_fence or not settings.strip_markdown_code_fence: + return raw_content + + match = MARKDOWN_CODE_FENCE_PATTERN.match(raw_content) + if not match: + return raw_content + + language = match.group("language").strip().lower() + content = match.group("content") + looks_like_html = bool( + re.search(r" None: if not settings.api_key: return @@ -261,7 +293,11 @@ def generate_html( unique_id = generate_unique_id(db) html_filename = f"{unique_id}.html" html_path = settings.html_storage_dir / html_filename - html_document = build_html_document(request.html_content, request.title) + normalized_html_content = strip_outer_markdown_code_fence( + request.html_content, + preserve_markdown_fence=request.preserve_markdown_fence, + ) + html_document = build_html_document(normalized_html_content, request.title) expires_at = datetime.utcnow() + timedelta( days=request.ttl_days or settings.default_retention_days ) diff --git a/backend/app/schemas.py b/backend/app/schemas.py index f2df9bd..5161830 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -30,6 +30,13 @@ class HTMLGenerateRequest(BaseModel): ge=1, description="Optional retention days for the file.", ) + preserve_markdown_fence: bool = Field( + default=False, + description=( + "When true, keeps outer Markdown code fences instead of stripping them " + "from the HTML payload." + ), + ) @root_validator(pre=True) def normalize_aliases(cls, values: dict) -> dict: @@ -73,6 +80,17 @@ class HTMLGenerateRequest(BaseModel): normalized = value.strip() return normalized or None + @validator("preserve_markdown_fence", pre=True) + def normalize_preserve_markdown_fence(cls, value: bool | str | None) -> bool: + if isinstance(value, bool): + return value + if value is None: + return False + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "on"} + + return bool(value) + @validator("ttl_days") def validate_ttl_days(cls, value: int | None) -> int | None: if value is None: