Collator.lstripwords: Strip words off the start and append to the end.
[python-collate.git] / collate / _abcollator.py
1 """Abstract base collator."""
2
3 import re
4
5 import collate.strings
6
class Collator(object):
    """Abstract base class for Collators.

    The base implementation uses the unicode form of a string as its
    sort key.

    Attributes:
    locale - the collator follows rules for this locale
    encoding - assumed string encoding
    """

    locale = "C"
    encoding = "ascii"

    def __init__(self, locale=None, encoding=None):
        """Accept a locale and an encoding.

        The base class ignores both arguments; the class-level
        defaults for locale and encoding stay in effect.
        """

    def cmp(self, string1, string2):
        """Return negative if a < b, zero if a == b, positive if a > b.

        The two strings are compared by their sort keys.
        """
        return cmp(self.key(string1), self.key(string2))

    def key(self, string):
        """Return a good sorting key for the string.

        The sort key should be considered an opaque value which is
        only meaningful when compared to other sort keys from the same
        collator.
        """
        return self.unicode(string)

    def words(self, string, sep=re.compile(r"(\s+)", re.UNICODE)):
        """Split the string into separate words.

        The string is converted to unicode first. Because the default
        separator pattern contains a capturing group, the separators
        themselves are returned as items of the list, between the
        words. A string that starts or ends with a separator yields an
        empty string as the first or last item.
        """
        return re.split(sep, self.unicode(string))

    def sortemekey(self, string):
        """Return a key based on sortemes of a string.

        A sorteme, by analogy with grapheme/morpheme/etc. is an atom
        of sort information. This is larger than a word boundary but
        smaller than a sentence boundary; roughly, a sorteme boundary
        occurs between letters and numbers, between numbers and
        numbers if 'too much' punctuation exists in between, between
        lines.

        Returns a 2-tuple of the sortemes of the string and the plain
        sort key of the whole string.
        """
        string = self.unicode(string)
        # Shove the sortkeyed original string on the end to resolve
        # ties intelligently.
        return (collate.strings.sortemes(string, self.key),
                self.key(string))

    def unicode(self, string):
        """Convert a str to a unicode using the collator encoding.

        The default conversion is tried first; only if it raises a
        UnicodeError is the string decoded with the collator encoding,
        replacing bytes that cannot be decoded.
        """
        try:
            return unicode(string)
        except UnicodeError:
            return string.decode(self.encoding, 'replace')

    def str(self, string):
        """Convert a unicode to a str using the collator encoding.

        The default conversion is tried first; only if it raises a
        UnicodeError is the string encoded with the collator encoding,
        replacing characters that cannot be encoded.
        """
        try:
            return str(string)
        except UnicodeError:
            return string.encode(self.encoding, 'replace')

    def lstripwords(
        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
        """Strip words and whitespace from the start of a string.

        Leading whitespace and leading words whose lower-cased form is
        in strip are removed.

        If append is not empty, it and the words stripped from the
        front are appended to the end. Nothing is appended when only
        whitespace was stripped. If every word was stripped, the
        stripped words are returned on their own, without append.
        """
        string = self.unicode(string)
        # Splitting leaves an empty string at either edge when the
        # string starts or ends with whitespace. An empty string is
        # neither whitespace nor a stop word, so a leading one would
        # end the scan below before anything was stripped, and a
        # trailing one would make an all-stop-word string look like it
        # still had words left. Empty items add nothing to the joined
        # result, so drop them.
        words = [word for word in self.words(string) if word]
        # Find the end of the leading run of whitespace and stop words.
        split = 0
        while split < len(words) and (
                words[split].isspace() or words[split].lower() in strip):
            split += 1
        stripped, words = words[:split], words[split:]
        # Only whitespace between stripped words is worth keeping;
        # trim it from both ends of the stripped run.
        while stripped and stripped[-1].isspace():
            stripped.pop()
        while stripped and stripped[0].isspace():
            stripped.pop(0)
        if append and stripped:
            if words:
                words.append(append)
            words.extend(stripped)
        return u"".join(words)

    def lstripsortemekey(
        self, string, strip=collate.strings.INITIAL_STOPS, append=u", "):
        """Return a key based on sortemes of a prefix-stripped string.

        Returns a 2-tuple of the sorteme key of the string after
        lstripwords has been applied and the plain sort key of the
        original string.
        """
        string = self.unicode(string)
        stripped = self.lstripwords(string, strip, append)
        return (self.sortemekey(stripped), self.key(string))