aboutsummaryrefslogtreecommitdiff
path: root/modules/mixed_unicode.py
diff options
context:
space:
mode:
authorGravatar jesopo2019-09-02 18:28:40 +0100
committerGravatar jesopo2019-09-02 18:28:40 +0100
commit24a510d1cae893d6bce07b6216ec398937e4584c (patch)
tree79f158a9101988f10fac06d0dd1509ccd3506a7e /modules/mixed_unicode.py
parentremove strax.py - no one uses it (diff)
signature
move mixed_unicode.py to bitbot-modules
Diffstat (limited to 'modules/mixed_unicode.py')
-rw-r--r--modules/mixed_unicode.py87
1 files changed, 0 insertions, 87 deletions
diff --git a/modules/mixed_unicode.py b/modules/mixed_unicode.py
deleted file mode 100644
index 057b971a..00000000
--- a/modules/mixed_unicode.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import collections, enum
-from src import ModuleManager, utils
-
-class Script(enum.Enum):
- Unknown = 0
- Latin = 1
- Cyrillic = 2
- Greek = 3
- Armenian = 4
- FullWidth = 5
- Coptic = 6
- Cherokee = 7
- TaiLe = 8
-
-class ScoreReason(enum.Enum):
- ScriptChange = 0
- ScriptChangeInWord = 1
- AdditionalScript = 2
-
-WORD_SEPERATORS = [",", " ", "\t", "."]
-
-SCORE_LENGTH = 100
-
-class Module(ModuleManager.BaseModule):
- def _detect_script(self, char):
- point = ord(char)
- # NULL .. LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
- if 0x0000 <= point <= 0x02AF:
- return Script.Latin
- # GREEK CAPITAL LETTER HETA .. GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
- elif 0x0370 <= point <= 0x03ff:
- return Script.Greek
- # CYRILLIC CAPITAL LETTER IE WITH GRAVE .. CYRILLIC SMALL LETTER EL WITH DESCENDER
- elif 0x0400 <= point <= 0x052F:
- return Script.Cyrillic
- # ARMENIAN CAPITAL LETTER AYB .. ARMENIAN HYPHEN
- elif 0x0531 <= point <= 0x058A:
- return Script.Armenian
- # FULLWIDTH EXCLAMATION MARK .. FULLWIDTH RIGHT WHITE PARENTHESIS
- elif 0xFF01 <= point <= 0xff60:
- return Script.FullWidth
- # COPTIC CAPITAL LETTER ALFA .. COPTIC MORPHOLOGICAL DIVIDER
- elif 0x2C80 <= point <= 0x2CFF:
- return Script.Coptic
- # CHEROKEE LETTER A .. CHEROKEE SMALL LETTER MV
- elif 0x13A0 <= point <= 0x13FD:
- return Script.Cherokee
- # TAI LE LETTER KA .. U+197F
- elif 0x1950 <= point <= 0x197F:
- return Script.TaiLe
- return Script.Unknown
-
- @utils.hook("received.message.channel")
- def channel_message(self, event):
- last_script = None
- last_was_separator = False
- reasons = []
- scripts = set([])
-
- for char in event["message"]:
- if char in WORD_SEPERATORS:
- last_was_separator = True
- else:
- script = self._detect_script(char)
- if not script == Script.Unknown:
- scripts.add(script)
- if last_script and not script == last_script:
- reasons.append(ScoreReason.ScriptChange)
- if not last_was_separator:
- reasons.append(ScoreReason.ScriptChangeInWord)
-
- last_script = script
-
- last_was_separator = False
-
- if len(scripts) > 1:
- reasons.extend([ScoreReason.AdditionalScript]*(len(scripts)-1))
-
- score = len(reasons)
- if score > 0:
- reasons_s = []
- for reason, count in collections.Counter(reasons).items():
- reasons_s.append("%s: %s" % (reason, count))
-
- self.log.trace(
- "Message given a mixed-unicode score of %s (reasons: %s)",
- [score, ", ".join(reasons_s)])