Skip to content

Commit

Permalink
Merge pull request #94 from mdevolde/patch-4-bytes-encoded
Browse files Browse the repository at this point in the history
Fix incorrect match offsets when applying corrections to text that contains characters encoded on 4 bytes in UTF-8
  • Loading branch information
jxmorris12 authored Aug 22, 2024
2 parents 90cfd79 + a1fdbc1 commit 75fbc2c
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions language_tool_python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,26 @@ def parse_url(url_str):
return urllib.parse.urlparse(url_str).geturl()


def _4_bytes_encoded_positions(text: str) -> List[int]:
    """Return the positions of the 4-byte encoded characters of ``text``.

    A character needs 4 bytes in UTF-8 exactly when its code point lies
    above U+FFFF (outside the Basic Multilingual Plane).

    The returned positions are NOT Python string indices: every 4-byte
    character seen so far advances the running index by 2 instead of 1,
    because LanguageTool counts such a character as two units of length
    (presumably UTF-16 code units) where Python counts one.

    :param text: the text to scan.
    :return: the LanguageTool-style start offset of every 4-byte
        character, in increasing order; empty if there is none.
    """
    positions = []
    lt_index = 0
    for char in text:
        # Compare the code point rather than encoding each character:
        # ``char.encode('utf-8')`` raises UnicodeEncodeError on a lone
        # surrogate, which a Python str is allowed to contain.
        if ord(char) > 0xFFFF:
            positions.append(lt_index)
            # Two units of length on the LanguageTool side.
            lt_index += 2
        else:
            lt_index += 1
    return positions


def correct(text: str, matches: List[Match]) -> str:
"""Automatically apply suggestions to the text."""
# Get the positions of 4-byte encoded characters in the text because without
# carrying out this step, the offsets of the matches could be incorrect.
for match in matches:
match.offset -= sum(1 for i in _4_bytes_encoded_positions(text) if i <= match.offset)
ltext = list(text)
matches = [match for match in matches if match.replacements]
errors = [ltext[match.offset:match.offset + match.errorLength]
Expand Down

0 comments on commit 75fbc2c

Please sign in to comment.