feat:自动剥离大模型返回的markdown代码围栏
This commit is contained in:
35
README.md
35
README.md
@@ -44,6 +44,7 @@
|
|||||||
| `source` | string | 否 | 来源标识,建议填智能体名称、工作流名或插件名,最大 80 个字符 |
|
| `source` | string | 否 | 来源标识,建议填智能体名称、工作流名或插件名,最大 80 个字符 |
|
||||||
| `request_id` | string | 否 | 请求追踪 ID,方便排查问题,最大 120 个字符 |
|
| `request_id` | string | 否 | 请求追踪 ID,方便排查问题,最大 120 个字符 |
|
||||||
| `ttl_days` | integer | 否 | 文件保留天数,默认 7 天,最大 30 天 |
|
| `ttl_days` | integer | 否 | 文件保留天数,默认 7 天,最大 30 天 |
|
||||||
|
| `preserve_markdown_fence` | boolean | 否 | 默认为 `false`。当智能体返回的内容整体被 `` ```html `` 这类 Markdown 围栏代码块包裹时,服务会自动剥离最外层围栏;如果需要原样保留围栏,传 `true` |
|
||||||
|
|
||||||
#### 兼容别名
|
#### 兼容别名
|
||||||
|
|
||||||
@@ -111,6 +112,39 @@
|
|||||||
|
|
||||||
这个地址会直接返回 `text/html` 内容。智能体侧通常只需要使用生成接口返回的 `url` 字段即可。
|
这个地址会直接返回 `text/html` 内容。智能体侧通常只需要使用生成接口返回的 `url` 字段即可。
|
||||||
|
|
||||||
|
## Markdown 代码围栏处理
|
||||||
|
|
||||||
|
很多模型会把 HTML 包在下面这种 Markdown 代码块里:
|
||||||
|
|
||||||
|
```text
|
||||||
|
```html
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>...</html>
|
||||||
|
```
|
||||||
|
```
|
||||||
|
|
||||||
|
当前服务默认会自动剥离最外层围栏,然后再保存 HTML。
|
||||||
|
|
||||||
|
为了避免误判,剥离逻辑是保守的,只有在以下条件同时满足时才会剥离:
|
||||||
|
|
||||||
|
- 除首尾空白外,整个请求内容就是一个外层 fenced code block(以围栏行开头、以围栏行结尾)
|
||||||
|
- 代码块语言是 `html`、`htm`、`xhtml`、`xml`,或者虽然没写语言但内容明显像 HTML
|
||||||
|
|
||||||
|
以下情况不会剥离:
|
||||||
|
|
||||||
|
- HTML 正文内部只是包含了几个 ``` 代码围栏示例
|
||||||
|
- 代码块语言是 `js`、`python` 等非 HTML
|
||||||
|
- 内容整体看起来不像 HTML
|
||||||
|
|
||||||
|
如果你确实希望把最外层 ``` 原样保留到页面中,可以在请求体里传:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"html_content": "```html\n<div>hello</div>\n```",
|
||||||
|
"preserve_markdown_fence": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## 智能体调用约定
|
## 智能体调用约定
|
||||||
|
|
||||||
建议腾讯云智能体在工具描述中遵守以下规则:
|
建议腾讯云智能体在工具描述中遵守以下规则:
|
||||||
@@ -215,6 +249,7 @@ NEXT_PUBLIC_API_BASE_URL=http://localhost:8000
|
|||||||
| `MAX_HTML_LENGTH` | integer | `200000` | 单次 HTML 最大字节数 |
|
| `MAX_HTML_LENGTH` | integer | `200000` | 单次 HTML 最大字节数 |
|
||||||
| `API_KEY` | string | 空 | 非空时启用 `X-API-Key` 鉴权 |
|
| `API_KEY` | string | 空 | 非空时启用 `X-API-Key` 鉴权 |
|
||||||
| `ALLOW_UNSAFE_HTML` | boolean | `true` | 是否关闭 HTML 安全拦截;当前默认开启,便于执行 JS |
|
| `ALLOW_UNSAFE_HTML` | boolean | `true` | 是否关闭 HTML 安全拦截;当前默认开启,便于执行 JS |
|
||||||
|
| `STRIP_MARKDOWN_CODE_FENCE` | boolean | `true` | 是否自动剥离最外层 Markdown 代码围栏 |
|
||||||
| `ENABLE_REQUEST_DEBUG_LOG` | boolean | `true` | 是否输出 422 请求调试日志 |
|
| `ENABLE_REQUEST_DEBUG_LOG` | boolean | `true` | 是否输出 422 请求调试日志 |
|
||||||
| `REQUEST_LOG_MAX_CHARS` | integer | `10000` | 单次请求日志最多记录多少字符 |
|
| `REQUEST_LOG_MAX_CHARS` | integer | `10000` | 单次请求日志最多记录多少字符 |
|
||||||
|
|
||||||
|
|||||||
@@ -8,5 +8,6 @@ MAX_RETENTION_DAYS=30
|
|||||||
MAX_HTML_LENGTH=200000
|
MAX_HTML_LENGTH=200000
|
||||||
API_KEY=""
|
API_KEY=""
|
||||||
ALLOW_UNSAFE_HTML=true
|
ALLOW_UNSAFE_HTML=true
|
||||||
|
STRIP_MARKDOWN_CODE_FENCE=true
|
||||||
ENABLE_REQUEST_DEBUG_LOG=true
|
ENABLE_REQUEST_DEBUG_LOG=true
|
||||||
REQUEST_LOG_MAX_CHARS=10000
|
REQUEST_LOG_MAX_CHARS=10000
|
||||||
|
|||||||
@@ -8,5 +8,6 @@ MAX_RETENTION_DAYS=30
|
|||||||
MAX_HTML_LENGTH=200000
|
MAX_HTML_LENGTH=200000
|
||||||
API_KEY=""
|
API_KEY=""
|
||||||
ALLOW_UNSAFE_HTML=true
|
ALLOW_UNSAFE_HTML=true
|
||||||
|
STRIP_MARKDOWN_CODE_FENCE=true
|
||||||
ENABLE_REQUEST_DEBUG_LOG=true
|
ENABLE_REQUEST_DEBUG_LOG=true
|
||||||
REQUEST_LOG_MAX_CHARS=10000
|
REQUEST_LOG_MAX_CHARS=10000
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ class Settings:
|
|||||||
max_html_length = max(1024, _get_int_env("MAX_HTML_LENGTH", 200_000))
|
max_html_length = max(1024, _get_int_env("MAX_HTML_LENGTH", 200_000))
|
||||||
api_key = os.getenv("API_KEY", "").strip()
|
api_key = os.getenv("API_KEY", "").strip()
|
||||||
allow_unsafe_html = _get_bool_env("ALLOW_UNSAFE_HTML", False)
|
allow_unsafe_html = _get_bool_env("ALLOW_UNSAFE_HTML", False)
|
||||||
|
strip_markdown_code_fence = _get_bool_env("STRIP_MARKDOWN_CODE_FENCE", True)
|
||||||
enable_request_debug_log = _get_bool_env("ENABLE_REQUEST_DEBUG_LOG", True)
|
enable_request_debug_log = _get_bool_env("ENABLE_REQUEST_DEBUG_LOG", True)
|
||||||
request_log_max_chars = max(256, _get_int_env("REQUEST_LOG_MAX_CHARS", 10_000))
|
request_log_max_chars = max(256, _get_int_env("REQUEST_LOG_MAX_CHARS", 10_000))
|
||||||
|
|
||||||
|
|||||||
@@ -48,6 +48,12 @@ CONTENT_SECURITY_POLICY = "; ".join(
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Matches a payload that, apart from leading/trailing whitespace, consists of
# exactly one fenced Markdown code block.
#
# Groups:
#   fence    - the opening run of three or more backticks
#   language - optional info string right after the fence (may be empty)
#   content  - everything between the opening and closing fence lines
#
# The closing fence must repeat the opening run exactly (backreference), so a
# payload wrapped in a four-backtick fence may itself contain three-backtick
# examples without being cut short. Payloads fenced with exactly three
# backticks match exactly as before.
MARKDOWN_CODE_FENCE_PATTERN = re.compile(
    r"^\s*(?P<fence>`{3,})(?P<language>[a-zA-Z0-9_+-]*)[^\S\r\n]*\r?\n"
    r"(?P<content>[\s\S]*?)"
    r"\r?\n(?P=fence)\s*$"
)
|
||||||
|
|
||||||
|
|
||||||
def build_response_headers() -> dict[str, str]:
|
def build_response_headers() -> dict[str, str]:
|
||||||
headers = {
|
headers = {
|
||||||
@@ -61,6 +67,32 @@ def build_response_headers() -> dict[str, str]:
|
|||||||
return headers
|
return headers
|
||||||
|
|
||||||
|
|
||||||
|
def strip_outer_markdown_code_fence(
    raw_content: str,
    preserve_markdown_fence: bool,
) -> str:
    """Return ``raw_content`` with its outermost Markdown code fence removed.

    The payload is returned untouched when the caller asked to preserve the
    fence, when stripping is disabled in settings, when the payload is not a
    single outer fenced block, when the fence names a non-HTML language, or
    when an unlabelled fence wraps content that does not look like markup.

    Args:
        raw_content: Payload exactly as received in the request.
        preserve_markdown_fence: Per-request opt-out; when true nothing is
            stripped.

    Returns:
        The inner content of the fence, or ``raw_content`` unchanged.
    """
    if preserve_markdown_fence:
        return raw_content
    if not settings.strip_markdown_code_fence:
        return raw_content

    fence_match = MARKDOWN_CODE_FENCE_PATTERN.match(raw_content)
    if fence_match is None:
        return raw_content

    fence_language = fence_match.group("language").strip().lower()
    inner = fence_match.group("content")

    # An explicit language label decides on its own: only markup dialects
    # are unwrapped, anything else (js, python, ...) is left as sent.
    if fence_language:
        if fence_language in {"html", "htm", "xhtml", "xml"}:
            return inner
        return raw_content

    # Unlabelled fence: unwrap only if the body contains something tag-like.
    html_hint = re.search(
        r"<!doctype\s+html|<html\b|<[a-z][\w:-]*\b", inner, re.IGNORECASE
    )
    if html_hint is None:
        return raw_content
    return inner
|
||||||
|
|
||||||
|
|
||||||
def require_api_key(x_api_key: str | None = Header(default=None, alias="X-API-Key")) -> None:
|
def require_api_key(x_api_key: str | None = Header(default=None, alias="X-API-Key")) -> None:
|
||||||
if not settings.api_key:
|
if not settings.api_key:
|
||||||
return
|
return
|
||||||
@@ -261,7 +293,11 @@ def generate_html(
|
|||||||
unique_id = generate_unique_id(db)
|
unique_id = generate_unique_id(db)
|
||||||
html_filename = f"{unique_id}.html"
|
html_filename = f"{unique_id}.html"
|
||||||
html_path = settings.html_storage_dir / html_filename
|
html_path = settings.html_storage_dir / html_filename
|
||||||
html_document = build_html_document(request.html_content, request.title)
|
normalized_html_content = strip_outer_markdown_code_fence(
|
||||||
|
request.html_content,
|
||||||
|
preserve_markdown_fence=request.preserve_markdown_fence,
|
||||||
|
)
|
||||||
|
html_document = build_html_document(normalized_html_content, request.title)
|
||||||
expires_at = datetime.utcnow() + timedelta(
|
expires_at = datetime.utcnow() + timedelta(
|
||||||
days=request.ttl_days or settings.default_retention_days
|
days=request.ttl_days or settings.default_retention_days
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -30,6 +30,13 @@ class HTMLGenerateRequest(BaseModel):
|
|||||||
ge=1,
|
ge=1,
|
||||||
description="Optional retention days for the file.",
|
description="Optional retention days for the file.",
|
||||||
)
|
)
|
||||||
|
preserve_markdown_fence: bool = Field(
|
||||||
|
default=False,
|
||||||
|
description=(
|
||||||
|
"When true, keeps outer Markdown code fences instead of stripping them "
|
||||||
|
"from the HTML payload."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
@root_validator(pre=True)
|
@root_validator(pre=True)
|
||||||
def normalize_aliases(cls, values: dict) -> dict:
|
def normalize_aliases(cls, values: dict) -> dict:
|
||||||
@@ -73,6 +80,17 @@ class HTMLGenerateRequest(BaseModel):
|
|||||||
normalized = value.strip()
|
normalized = value.strip()
|
||||||
return normalized or None
|
return normalized or None
|
||||||
|
|
||||||
|
@validator("preserve_markdown_fence", pre=True)
def normalize_preserve_markdown_fence(cls, value: bool | str | None) -> bool:
    """Coerce a loosely typed ``preserve_markdown_fence`` value to ``bool``.

    ``None`` becomes ``False`` and real booleans pass through. Strings are
    true only when, after trimming and lower-casing, they equal one of
    ``1``, ``true``, ``yes`` or ``on``; every other string is false. Any
    other type falls back to Python truthiness.
    """
    if value is None:
        return False
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        truthy_tokens = {"1", "true", "yes", "on"}
        token = value.strip().lower()
        return token in truthy_tokens
    return bool(value)
|
||||||
|
|
||||||
@validator("ttl_days")
|
@validator("ttl_days")
|
||||||
def validate_ttl_days(cls, value: int | None) -> int | None:
|
def validate_ttl_days(cls, value: int | None) -> int | None:
|
||||||
if value is None:
|
if value is None:
|
||||||
|
|||||||
Reference in New Issue
Block a user