Commit 68c17469 authored by Steven

fix(markdown): fix UTF-8 truncation for CJK characters in snippet generation

The truncateAtWord function was slicing strings by byte position instead of
character position. When truncating text with multi-byte UTF-8 characters
(like CJK), this could cut in the middle of a character, creating invalid
UTF-8 and causing gRPC marshaling errors.

Fixed by converting to runes before truncation to ensure we always cut at
proper character boundaries. Added test cases for CJK characters.

Fixes #5276

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
parent e17cd163
......@@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
}
// truncateAtWord truncates a string at the last word boundary before maxLength.
// maxLength is treated as a rune (character) count to properly handle UTF-8 multi-byte characters.
func truncateAtWord(s string, maxLength int) string {
if len(s) <= maxLength {
// Convert to runes to properly handle multi-byte UTF-8 characters
runes := []rune(s)
if len(runes) <= maxLength {
return s
}
// Truncate to max length
truncated := s[:maxLength]
// Truncate to max length (by character count, not byte count)
truncated := string(runes[:maxLength])
// Find last space
// Find last space to avoid cutting in the middle of a word
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
if lastSpace > 0 {
truncated = truncated[:lastSpace]
......
......@@ -382,6 +382,18 @@ func TestTruncateAtWord(t *testing.T) {
maxLength: 10,
expected: "supercalif ...",
},
{
name: "CJK characters without spaces",
input: "这是一个很长的中文句子没有空格的情况下也要正确处理",
maxLength: 15,
expected: "这是一个很长的中文句子没有空格 ...",
},
{
name: "mixed CJK and Latin",
input: "这是中文mixed with English文字",
maxLength: 10,
expected: "这是中文mixed ...",
},
}
for _, tt := range tests {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment