Fix double-width characters disappearing when wrapping (#3180)

* Update docstring for `Text.wrap`s width parameter to indicate that it's referring to the number of *single-width* characters. Also a small addition to the gitignore file. * Working on double width wrapping fixes * Chop cells to fit to width * Fix folding when theres already text on line * Update wrapping logic to fix issues with CJK charcters disappearing when the "fold" location sat *within* a double-width character. Ensure we retain browser logic of: if there is no space on the current line, move to a new line, and if theres not enough space on the entire new line, fold the text over multiple lines at appropriate locations. * Remove old TODO comments * Add regression test note * Rename function to avoid breaking change * Update CHANGELOG * Remove old comment that is no longer relevant * Cover off some wrapping edge cases * Adding docstrings to tests explaining their purpose * Renaming a local, function scope function alias * Update rich/_wrap.py Co-authored-by: Rodrigo Girão Serrão <5621605+rodrigogiraoserrao@users.noreply.github.com> * PR feedback * Testing wrapping with trailing and leading whitespace * Improve docstring wording --------- Co-authored-by: Rodrigo Girão Serrão <5621605+rodrigogiraoserrao@users.noreply.github.com>
2023-11-14 10:03:54 +00:00 · 2023-11-14 10:03:54 +00:00 · 59b1aca63b
parent b32e42bda0
commit 59b1aca63b
6 changed files with 177 additions and 38 deletions
--- a/.gitignore
+++ b/.gitignore
@ -117,3 +117,5 @@ venv.bak/
 # airspeed velocity
 benchmarks/env/
 benchmarks/html/
+
+sandbox/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Some text goes missing during wrapping when it contains double width characters https://github.com/Textualize/rich/issues/3176
 - Ensure font is correctly inherited in exported HTML https://github.com/Textualize/rich/issues/3104
 - Fixed typing for `FloatPrompt`.

--- a/rich/_wrap.py
+++ b/rich/_wrap.py
@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import re
-from typing import Iterable, List, Tuple
+from typing import Iterable

 from ._loop import loop_last
 from .cells import cell_len, chop_cells
@ -7,7 +9,11 @@ from .cells import cell_len, chop_cells
 re_word = re.compile(r"\s*\S+\s*")


-def words(text: str) -> Iterable[Tuple[int, int, str]]:
+def words(text: str) -> Iterable[tuple[int, int, str]]:
+    """Yields each word from the text as a tuple
+    containing (start_index, end_index, word). A "word" in this context may
+    include the actual word and any whitespace to the right.
+    """
    position = 0
    word_match = re_word.match(text, position)
    while word_match is not None:
@ -17,35 +23,59 @@ def words(text: str) -> Iterable[Tuple[int, int, str]]:
        word_match = re_word.match(text, end)


-def divide_line(text: str, width: int, fold: bool = True) -> List[int]:
-    divides: List[int] = []
-    append = divides.append
-    line_position = 0
+def divide_line(text: str, width: int, fold: bool = True) -> list[int]:
+    """Given a string of text, and a width (measured in cells), return a list
+    of cell offsets which the string should be split at in order for it to fit
+    within the given width.
+
+    Args:
+        text: The text to examine.
+        width: The available cell width.
+        fold: If True, words longer than `width` will be folded onto a new line.
+
+    Returns:
+        A list of indices to break the line at.
+    """
+    break_positions: list[int] = []  # offsets to insert the breaks at
+    append = break_positions.append
+    cell_offset = 0
    _cell_len = cell_len
+
    for start, _end, word in words(text):
        word_length = _cell_len(word.rstrip())
-        if line_position + word_length > width:
+        remaining_space = width - cell_offset
+        word_fits_remaining_space = remaining_space >= word_length
+
+        if word_fits_remaining_space:
+            # Simplest case - the word fits within the remaining width for this line.
+            cell_offset += _cell_len(word)
+        else:
+            # Not enough space remaining for this word on the current line.
            if word_length > width:
+                # The word doesn't fit on any line, so we can't simply
+                # place it on the next line...
                if fold:
-                    chopped_words = chop_cells(word, max_size=width, position=0)
-                    for last, line in loop_last(chopped_words):
+                    # Fold the word across multiple lines.
+                    folded_word = chop_cells(word, width=width)
+                    for last, line in loop_last(folded_word):
                        if start:
                            append(start)
-
                        if last:
-                            line_position = _cell_len(line)
+                            cell_offset = _cell_len(line)
                        else:
                            start += len(line)
                else:
+                    # Folding isn't allowed, so crop the word.
                    if start:
                        append(start)
-                    line_position = _cell_len(word)
-            elif line_position and start:
+                    cell_offset = _cell_len(word)
+            elif cell_offset and start:
+                # The word doesn't fit within the remaining space on the current
+                # line, but it *can* fit on to the next (empty) line.
                append(start)
-                line_position = _cell_len(word)
-        else:
-            line_position += _cell_len(word)
-    return divides
+                cell_offset = _cell_len(word)
+
+    return break_positions


 if __name__ == "__main__":  # pragma: no cover
@ -53,4 +83,11 @@ if __name__ == "__main__":  # pragma: no cover

    console = Console(width=10)
    console.print("12345 abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPQRSTUVWXYZ 12345")
-    print(chop_cells("abcdefghijklmnopqrstuvwxyz", 10, position=2))
+    print(chop_cells("abcdefghijklmnopqrstuvwxyz", 10))
+
+    console = Console(width=20)
+    console.rule()
+    console.print("TextualはPythonの高速アプリケーション開発フレームワークです")
+
+    console.rule()
+    console.print("アプリケーションは1670万色を使用でき")
--- a/rich/cells.py
+++ b/rich/cells.py
@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import re
 from functools import lru_cache
-from typing import Callable, List
+from typing import Callable

 from ._cell_widths import CELL_WIDTHS

@ -119,27 +121,39 @@ def set_cell_size(text: str, total: int) -> str:
            start = pos


-# TODO: This is inefficient
-# TODO: This might not work with CWJ type characters
-def chop_cells(text: str, max_size: int, position: int = 0) -> List[str]:
-    """Break text in to equal (cell) length strings, returning the characters in reverse
-    order"""
-    _get_character_cell_size = get_character_cell_size
-    characters = [
-        (character, _get_character_cell_size(character)) for character in text
-    ]
-    total_size = position
-    lines: List[List[str]] = [[]]
-    append = lines[-1].append
+def chop_cells(
+    text: str,
+    width: int,
+) -> list[str]:
+    """Split text into lines such that each line fits within the available (cell) width.

-    for character, size in reversed(characters):
-        if total_size + size > max_size:
-            lines.append([character])
-            append = lines[-1].append
-            total_size = size
+    Args:
+        text: The text to fold such that it fits in the given width.
+        width: The width available (number of cells).
+
+    Returns:
+        A list of strings such that each string in the list has cell width
+        less than or equal to the available width.
+    """
+    _get_character_cell_size = get_character_cell_size
+    lines: list[list[str]] = [[]]
+
+    append_new_line = lines.append
+    append_to_last_line = lines[-1].append
+
+    total_width = 0
+
+    for character in text:
+        cell_width = _get_character_cell_size(character)
+        char_doesnt_fit = total_width + cell_width > width
+
+        if char_doesnt_fit:
+            append_new_line([character])
+            append_to_last_line = lines[-1].append
+            total_width = cell_width
        else:
-            total_size += size
-            append(character)
+            append_to_last_line(character)
+            total_width += cell_width

    return ["".join(line) for line in lines]

--- a/tests/test_cells.py
+++ b/tests/test_cells.py
@ -1,4 +1,5 @@
 from rich import cells
+from rich.cells import chop_cells


 def test_cell_len_long_string():
@ -40,3 +41,21 @@ def test_set_cell_size_infinite():
            )
            == size
        )
+
+
+def test_chop_cells():
+    """Simple example of splitting cells into lines of width 3."""
+    text = "abcdefghijk"
+    assert chop_cells(text, 3) == ["abc", "def", "ghi", "jk"]
+
+
+def test_chop_cells_double_width_boundary():
+    """The available width lies within a double-width character."""
+    text = "ありがとう"
+    assert chop_cells(text, 3) == ["あ", "り", "が", "と", "う"]
+
+
+def test_chop_cells_mixed_width():
+    """Mixed single and double-width characters."""
+    text = "あ1り234が5と6う78"
+    assert chop_cells(text, 3) == ["あ1", "り2", "34", "が5", "と6", "う7", "8"]
--- a/tests/test_text.py
+++ b/tests/test_text.py
@ -449,6 +449,20 @@ def test_wrap_cjk_width_mid_character():
    ]


+def test_wrap_cjk_mixed():
+    """Regression test covering https://github.com/Textualize/rich/issues/3176 and
+    https://github.com/Textualize/textual/issues/3567 - double width characters could
+    result in text going missing when wrapping."""
+    text = Text("123ありがとうございました")
+    console = Console(width=20)  # let's ensure the width passed to wrap() wins.
+
+    wrapped_lines = text.wrap(console, width=8)
+    with console.capture() as capture:
+        console.print(wrapped_lines)
+
+    assert capture.get() == "123あり\nがとうご\nざいまし\nた\n"
+
+
 def test_wrap_long():
    text = Text("abracadabra", justify="left")
    lines = text.wrap(Console(), 4)
@ -497,6 +511,47 @@ def test_wrap_long_words_2():
    ]


+def test_wrap_long_words_followed_by_other_words():
+    """After folding a word across multiple lines, we should continue from
+    the next word immediately after the folded word (don't take a newline
+    following completion of the folded word)."""
+    text = Text("123 12345678 123 123")
+    lines = text.wrap(Console(), 6)
+    assert lines._lines == [
+        Text("123 "),
+        Text("123456"),
+        Text("78 123"),
+        Text("123"),
+    ]
+
+
+def test_wrap_long_word_preceeded_by_word_of_full_line_length():
+    """The width of the first word is the same as the available width.
+    Ensures that folding works correctly when there's no space available
+    on the current line."""
+    text = Text("123456 12345678 123 123")
+    lines = text.wrap(Console(), 6)
+    assert lines._lines == [
+        Text("123456"),
+        Text("123456"),
+        Text("78 123"),
+        Text("123"),
+    ]
+
+
+def test_wrap_multiple_consecutive_spaces():
+    """Adding multiple consecutive spaces at the end of a line does not impact
+    the location at which a break will be added during the process of wrapping."""
+    text = Text("123456    12345678 123 123")
+    lines = text.wrap(Console(), 6)
+    assert lines._lines == [
+        Text("123456"),
+        Text("123456"),
+        Text("78 123"),
+        Text("123"),
+    ]
+
+
 def test_wrap_long_words_justify_left():
    text = Text("X 123456789", justify="left")
    lines = text.wrap(Console(), 4)
@ -508,6 +563,17 @@ def test_wrap_long_words_justify_left():
    assert lines[3] == Text("9   ")


+def test_wrap_leading_and_trailing_whitespace():
+    text = Text("   123  456 789   ")
+    lines = text.wrap(Console(), 4)
+    assert lines._lines == [
+        Text("   1"),
+        Text("23  "),
+        Text("456 "),
+        Text("789 "),
+    ]
+
+
 def test_no_wrap_no_crop():
    text = Text("Hello World!" * 3)