Skip to content

Commit

Permalink
fix: vmark
Browse files (browse the repository at this point in the history)
  • Loading branch information
Byaidu committed Dec 13, 2024
1 parent 833a857 commit b63ff57
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 9 deletions.
16 changes: 8 additions & 8 deletions pdf2zh/converter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -235,7 +235,7 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落
# or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
# 禁止纯公式(代码)段落换行,直到文字开始再重开文字段落,保证只存在两种情况
# A. 纯公式(代码)段落(锚定绝对位置)sstk[-1]=="" -> sstk[-1]=="$v*$"
# A. 纯公式(代码)段落(锚定绝对位置)sstk[-1]=="" -> sstk[-1]=="{v*}"
# B. 文字开头段落(排版相对位置)sstk[-1]!=""
or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax) # 因为 cls==xt_cls==0 一定有 sstk[-1]=="",所以这里不需要再判定 cls!=0
):
Expand All @@ -247,8 +247,8 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
):
vfix = vstk[0].y0 - child.y0
if sstk[-1] == "":
xt_cls = -1 # 禁止纯公式段落(sstk[-1]=="$v*$")的后续连接,但是要考虑新字符和后续字符的连接,所以这里修改的是上个字符的类别
sstk[-1] += f"$v{len(var)}$"
xt_cls = -1 # 禁止纯公式段落(sstk[-1]=="{v*}")的后续连接,但是要考虑新字符和后续字符的连接,所以这里修改的是上个字符的类别
sstk[-1] += f"{{v{len(var)}}}"
var.append(vstk)
varl.append(vlstk)
varf.append(vfix)
Expand Down Expand Up @@ -305,14 +305,14 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
pass
# 处理结尾
if vstk: # 公式出栈
sstk[-1] += f"$v{len(var)}$"
sstk[-1] += f"{{v{len(var)}}}"
var.append(vstk)
varl.append(vlstk)
varf.append(vfix)
log.debug("\n==========[VSTACK]==========\n")
for id, v in enumerate(var): # 计算公式宽度
l = max([vch.x1 for vch in v]) - v[0].x0
log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > v{id} = {"".join([ch.get_text() for ch in v])}')
vlen.append(l)

############################################################
Expand All @@ -323,7 +323,7 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体

@retry(wait=wait_fixed(1))
def worker(s: str): # 多线程翻译
if not s.strip() or re.match(r"^\$v\d+\$$", s): # 空白和公式不翻译
if not s.strip() or re.match(r"^\{v\d+\}$", s): # 空白和公式不翻译
return s
try:
hash_key_paragraph = cache.deterministic_hash(
Expand Down Expand Up @@ -371,8 +371,8 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
while ptr < len(new):
vy_regex = re.match(
r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
) # 匹配 {vn} 公式标记
mod = 0 # 文字修饰符
if vy_regex: # 加载公式
ptr += len(vy_regex.group(0))
Expand Down
2 changes: 1 addition & 1 deletion pdf2zh/translator.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -43,7 +43,7 @@ def prompt(self, text):
},
{
"role": "user",
"content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
"content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation {{v*}} unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
},
]

Expand Down

0 comments on commit b63ff57

Please sign in to comment.