projects
/
python-collate.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Category-based splitting.
[python-collate.git]
/
collate
/
_strings.py
diff --git a/collate/_strings.py b/collate/_strings.py
index f81bfd7..d872ed4 100644 (file)
--- a/collate/_strings.py
+++ b/collate/_strings.py
@@ -1,9 +1,12 @@
import unicodedata
import unicodedata
-def strip_nonalnum(string):
- while string and not (string[0].isalpha() or string[0].isnumeric()):
+def strip_punc(string):
+ return filter(lambda c: unicodedata.category(c)[0] not in "PS", string)
+
+def strip_ends(string):
+ while string and unicodedata.category(string[0])[0] in "ZPS":
string = string[1:]
string = string[1:]
- while string and not (string[-1].isalpha() or string[-1].isnumeric()):
+ while string and unicodedata.category(string[-1])[0] in "ZPS":
string = string[:-1]
return string
string = string[:-1]
return string
@@ -15,6 +18,7 @@ def alnumsplit(string):
numeric = None
start = 0
for i, char in enumerate(string):
numeric = None
start = 0
for i, char in enumerate(string):
+ category = unicodedata.category(char)
if numeric is None:
broke = False
if char.isnumeric():
if numeric is None:
broke = False
if char.isnumeric():
@@ -24,17 +28,17 @@ def alnumsplit(string):
elif numeric and char.isalpha():
broke = True
numeric = False
elif numeric and char.isalpha():
broke = True
numeric = False
- elif numeric and char.isspace():
+ elif numeric and category in ["Zs", "Ps", "Pe"]:
broke = True
numeric = None
elif not numeric and char.isnumeric():
broke = True
numeric = True
if broke:
broke = True
numeric = None
elif not numeric and char.isnumeric():
broke = True
numeric = True
if broke:
- strings.append(strip_nonalnum(string[start:i]))
+ strings.append(strip_ends(string[start:i]))
start = i
broke = False
start = i
broke = False
- strings.append(strip_nonalnum(string[start:i + 1]))
+ strings.append(strip_ends(string[start:i + 1]))
return strings
def wordlike(string):
return strings
def wordlike(string):
@@ -85,7 +89,7 @@ def numeric(orig, invalid=float('inf')):
total = 0
for c in string:
v = unicodedata.numeric(c)
total = 0
for c in string:
v = unicodedata.numeric(c)
- if v >= 1:
+ if v >= 1 or v == 0:
total *= 10
total += v
return total
total *= 10
total += v
return total