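Fixed #5025 -- Added a "truncatechars" template filter. Introduced the Truncator class in django.utils.text and deprecated truncate_words() and truncate_html_words() in favor of it. Many thanks to Chris Beaven.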

git-svn-id: http://code.djangoproject.com/svn/django/trunk@16542 bcc190cf-cafb-0310-a4f2-bffc1f526a37
2025-09-21 18:00:45 +00:00 · 2011-07-14 13:47:10 +00:00 · 2011-07-14 13:47:10 +00:00 · 3b77458371
commit 3b77458371
parent 12b7c2a702
7 changed files with 281 additions and 86 deletions
--- a/django/utils/text.py
+++ b/django/utils/text.py
@ -1,4 +1,6 @@
 import re
+import unicodedata
+import warnings
 from gzip import GzipFile
 from htmlentitydefs import name2codepoint

@ -8,14 +10,18 @@ except ImportError:
    from StringIO import StringIO

 from django.utils.encoding import force_unicode
-from django.utils.functional import allow_lazy
-from django.utils.translation import ugettext_lazy, ugettext as _
-
+from django.utils.functional import allow_lazy, SimpleLazyObject
+from django.utils.translation import ugettext_lazy, ugettext as _, pgettext

 # Capitalizes the first letter of a string.
 capfirst = lambda x: x and force_unicode(x)[0].upper() + force_unicode(x)[1:]
 capfirst = allow_lazy(capfirst, unicode)

+# Patterns used by the HTML-aware word truncation, compiled once at import.
+#
+# re_words matches an "&...;" run, a "<...>" run, or a word (a word
+# character followed by any number of word characters or hyphens). Only a
+# word fills group 1, which is how callers tell words from markup.
+re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.UNICODE)
+# re_tag takes a "<...>" run apart: group 1 is the "/" of a closing tag,
+# group 2 is the tag name and group 3 is the "/" of a " /" self-closing tag.
+re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
+
+
 def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
@ -44,88 +50,172 @@ def wrap(text, width):
    return u''.join(_generator())
 wrap = allow_lazy(wrap, unicode)

-def truncate_words(s, num, end_text='...'):
-    """Truncates a string after a certain number of words. Takes an optional
-    argument of what should be used to notify that the string has been
-    truncated, defaulting to ellipsis (...)

-    Newlines in the string will be stripped.
+class Truncator(SimpleLazyObject):
    """
-    s = force_unicode(s)
-    length = int(num)
-    words = s.split()
-    if len(words) > length:
-        words = words[:length]
-        if not words[-1].endswith(end_text):
-            words.append(end_text)
-    return u' '.join(words)
+    An object used to truncate text, either by characters or words.
+    """
+    def __init__(self, text):
+        """
+        Stores ``text`` for later truncation. The ``force_unicode``
+        conversion is not run here; it is wrapped in a callable that is
+        handed to the base class initializer.
+        """
+        def _as_unicode():
+            return force_unicode(text)
+        super(Truncator, self).__init__(_as_unicode)
+
+    def add_truncation_text(self, text, truncate=None):
+        """
+        Returns ``text`` combined with the truncation marker ``truncate``.
+
+        When ``truncate`` is None, a translatable default consisting of the
+        text followed by an ellipsis (...) is used. A marker that contains
+        the ``%(truncated_text)s`` placeholder is used as a format string
+        with ``text`` substituted into it. Any other marker is appended to
+        ``text``, unless ``text`` already ends with it.
+        """
+        marker = truncate
+        if marker is None:
+            marker = pgettext(
+                'String to return when truncating text',
+                u'%(truncated_text)s...')
+        marker = force_unicode(marker)
+        if '%(truncated_text)s' not in marker:
+            # Plain suffix form: append it, but never double it up when the
+            # text already ends with the very same marker.
+            if text.endswith(marker):
+                return text
+            return '%s%s' % (text, marker)
+        # Template form: the marker itself decides where the text goes.
+        return marker % {'truncated_text': text}
+
+    def chars(self, num, truncate=None):
+        """
+        Returns the text shortened to ``num`` characters.
+
+        ``truncate`` is the optional text used to signal that the string was
+        shortened; it defaults to a translatable ellipsis (...). The room
+        taken by that text is deducted from ``num``, so as little as none of
+        the original text may be kept. Combining characters are not counted
+        towards the length, and the text is NFC-normalized before counting.
+        """
+        limit = int(num)
+        normalized = unicodedata.normalize('NFC', self._wrapped)
+
+        # Work out how many characters of the text itself may be kept: the
+        # limit minus the non-combining characters of the truncation text.
+        keep = limit
+        for marker_char in self.add_truncation_text('', truncate):
+            if unicodedata.combining(marker_char):
+                continue
+            keep -= 1
+            if keep == 0:
+                break
+
+        cut_at = None
+        visible = 0
+        for index, char in enumerate(normalized):
+            if unicodedata.combining(char):
+                # Combining characters do not add to the string length.
+                continue
+            visible += 1
+            if cut_at is None and visible > keep:
+                # First character that no longer fits next to the marker.
+                cut_at = index
+            if visible > limit:
+                # The text really is too long: cut it and add the marker.
+                return self.add_truncation_text(normalized[:cut_at or 0],
+                                                truncate)
+
+        # No truncation was needed; hand back the normalized text.
+        return normalized
+    chars = allow_lazy(chars)
+
+    def words(self, num, truncate=None, html=False):
+        """
+        Returns the text cut off after ``num`` words.
+
+        ``truncate`` is the optional text used to signal that the string was
+        shortened, defaulting to an ellipsis (...). With ``html`` set, the
+        work is done by ``_html_words`` instead of ``_text_words``.
+        """
+        limit = int(num)
+        if not html:
+            return self._text_words(limit, truncate)
+        return self._html_words(limit, truncate)
+    words = allow_lazy(words)
+
+    def _text_words(self, length, truncate):
+        """
+        Truncates a string after a certain number of words.
+
+        ``length`` is the maximum number of words to keep and ``truncate``
+        is passed through to ``add_truncation_text``. A ``length`` of zero
+        or less returns an empty string, the same as ``_html_words`` does.
+
+        Runs of whitespace, newlines included, are collapsed to single
+        spaces.
+        """
+        if length <= 0:
+            # Without this guard a negative length would act as a
+            # from-the-end slice index below and keep all but the last
+            # few words instead of none.
+            return u''
+        words = self._wrapped.split()
+        if len(words) > length:
+            kept = u' '.join(words[:length])
+            return self.add_truncation_text(kept, truncate)
+        return u' '.join(words)
+
+    def _html_words(self, length, truncate):
+        """
+        Truncates HTML to a certain number of words (not counting tags and
+        comments). Closes opened tags if they were correctly closed in the
+        given HTML.
+
+        ``length`` is the number of words to keep; zero or less returns an
+        empty string. ``truncate`` is handed to ``add_truncation_text`` to
+        build the text inserted at the cut. If the text holds no more than
+        ``length`` words it is returned unchanged.
+
+        Newlines in the HTML are preserved.
+        """
+        if length <= 0:
+            return u''
+        # Tags that never take a closing tag, so they are never tracked as
+        # open.
+        html4_singlets = (
+            'br', 'col', 'link', 'base', 'img',
+            'param', 'area', 'hr', 'input'
+        )
+        # Count non-HTML words and keep note of open tags
+        pos = 0
+        end_text_pos = 0
+        words = 0
+        open_tags = []
+        # Scan until one word more than ``length`` has been seen; that extra
+        # word is what proves truncation is actually needed.
+        while words <= length:
+            m = re_words.search(self._wrapped, pos)
+            if not m:
+                # Checked through whole string
+                break
+            pos = m.end(0)
+            if m.group(1):
+                # It's an actual non-HTML word
+                words += 1
+                if words == length:
+                    # Remember where the last word to keep ends.
+                    end_text_pos = pos
+                continue
+            # Check for tag
+            tag = re_tag.match(m.group(0))
+            if not tag or end_text_pos:
+                # Don't worry about non tags or tags after our truncate point
+                continue
+            closing_tag, tagname, self_closing = tag.groups()
+            # Element names are always case-insensitive
+            tagname = tagname.lower()
+            if self_closing or tagname in html4_singlets:
+                pass
+            elif closing_tag:
+                # Check for match in open tags list
+                try:
+                    i = open_tags.index(tagname)
+                except ValueError:
+                    pass
+                else:
+                    # SGML: An end tag closes, back to the matching start tag,
+                    # all unclosed intervening start tags with omitted end tags
+                    open_tags = open_tags[i + 1:]
+            else:
+                # Add it to the start of the open tags list
+                # (most recently opened first, so the closing loop below
+                # emits the innermost tag first).
+                open_tags.insert(0, tagname)
+        if words <= length:
+            # Don't try to close tags if we don't need to truncate
+            return self._wrapped
+        out = self._wrapped[:end_text_pos]
+        # Passing an empty text yields just the truncation marker itself.
+        truncate_text = self.add_truncation_text('', truncate)
+        if truncate_text:
+            out += truncate_text
+        # Close any tags still open
+        for tag in open_tags:
+            out += '</%s>' % tag
+        # Return string
+        return out
+
+def truncate_words(s, num, end_text='...'):
+    """
+    Deprecated wrapper around ``Truncator(s).words(num)``.
+
+    Issues a PendingDeprecationWarning on every call. A non-empty
+    ``end_text`` is passed on as the truncation text with one leading space;
+    an empty (falsy) one is passed on as an empty truncation text.
+    """
+    message = ('This function has been deprecated. Use the Truncator class '
+        'in django.utils.text instead.')
+    warnings.warn(message, PendingDeprecationWarning)
+    if end_text:
+        truncate = ' %s' % end_text
+    else:
+        truncate = ''
+    return Truncator(s).words(num, truncate=truncate)
 truncate_words = allow_lazy(truncate_words, unicode)

 def truncate_html_words(s, num, end_text='...'):
-    """Truncates HTML to a certain number of words (not counting tags and
-    comments). Closes opened tags if they were correctly closed in the given
-    html. Takes an optional argument of what should be used to notify that the
-    string has been truncated, defaulting to ellipsis (...).
-
-    Newlines in the HTML are preserved.
-    """
-    s = force_unicode(s)
-    length = int(num)
-    if length <= 0:
-        return u''
-    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
-    # Set up regular expressions
-    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
-    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
-    # Count non-HTML words and keep note of open tags
-    pos = 0
-    end_text_pos = 0
-    words = 0
-    open_tags = []
-    while words <= length:
-        m = re_words.search(s, pos)
-        if not m:
-            # Checked through whole string
-            break
-        pos = m.end(0)
-        if m.group(1):
-            # It's an actual non-HTML word
-            words += 1
-            if words == length:
-                end_text_pos = pos
-            continue
-        # Check for tag
-        tag = re_tag.match(m.group(0))
-        if not tag or end_text_pos:
-            # Don't worry about non tags or tags after our truncate point
-            continue
-        closing_tag, tagname, self_closing = tag.groups()
-        tagname = tagname.lower()  # Element names are always case-insensitive
-        if self_closing or tagname in html4_singlets:
-            pass
-        elif closing_tag:
-            # Check for match in open tags list
-            try:
-                i = open_tags.index(tagname)
-            except ValueError:
-                pass
-            else:
-                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
-                open_tags = open_tags[i+1:]
-        else:
-            # Add it to the start of the open tags list
-            open_tags.insert(0, tagname)
-    if words <= length:
-        # Don't try to close tags if we don't need to truncate
-        return s
-    out = s[:end_text_pos]
-    if end_text:
-        out += ' ' + end_text
-    # Close any tags still open
-    for tag in open_tags:
-        out += '</%s>' % tag
-    # Return string
-    return out
+    """
+    Deprecated wrapper around ``Truncator(s).words(num, html=True)``.
+
+    Issues a PendingDeprecationWarning on every call. A non-empty
+    ``end_text`` is passed on as the truncation text with one leading space,
+    which keeps the ' ' + end_text suffix the previous implementation
+    appended; an empty (falsy) one is passed on as an empty truncation text.
+    """
+    warnings.warn('This function has been deprecated. Use the Truncator class '
+        'in django.utils.text instead.', category=PendingDeprecationWarning)
+    truncate = end_text and ' %s' % end_text or ''
+    return Truncator(s).words(num, truncate=truncate, html=True)
 truncate_html_words = allow_lazy(truncate_html_words, unicode)

 def get_valid_filename(s):