aboutsummaryrefslogtreecommitdiff
path: root/modules/mixed_unicode.py
blob: c03dbd1c0823f8df30e46f9f272c96e3ae8b1c58 (about) (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import enum, itertools
from src import ModuleManager, utils

@enum.unique
class Script(enum.Enum):
    """Writing systems a single character can be classified as.

    The trailing comments give the code point range that
    ``Module._detect_script`` maps to each member.
    """

    Unknown = 0    # fallback: code point is outside every range below
    Latin = 1      # U+0000 .. U+02AF
    Cyrillic = 2   # U+0400 .. U+052F
    Greek = 3      # U+0370 .. U+03FF
    Armenian = 4   # U+0531 .. U+058A
    FullWidth = 5  # U+FF01 .. U+FF60
    Coptic = 6     # U+2C80 .. U+2CFF
    Cherokee = 7   # U+13A0 .. U+13FD
    TaiLe = 8      # U+1950 .. U+197F

@enum.unique
class ScoreReason(enum.Enum):
    """Reasons that each add one point to a message's mixed-unicode score."""

    # A classified character uses a different script than the previous
    # classified character.
    ScriptChange = 0
    # Recorded in addition to ScriptChange when the character immediately
    # before the change was not a word separator.
    ScriptChangeInWord = 1
    # Recorded once for every distinct script in the message beyond the first.
    AdditionalScript = 2

# Characters that end a word. A script change that directly follows one of
# these is not counted as a change *inside* a word.
# (The misspelling is kept: the name is referenced elsewhere in this file.)
WORD_SEPERATORS = [
    ",",
    " ",
    "\t",
    ".",
]

# NOTE(review): not referenced by any code visible in this file; presumably a
# window/limit for scoring - confirm before relying on it.
SCORE_LENGTH = 100

class Module(ModuleManager.BaseModule):
    """Scores channel messages by how much they mix Unicode scripts.

    Every message received through the ``received.message.channel`` hook is
    scanned character by character; a non-zero score is written to the trace
    log together with a per-reason breakdown.
    """

    def _detect_script(self, char):
        """Classify a single character by its Unicode code point.

        :param char: a one-character string (it is passed to ``ord``).
        :returns: the ``Script`` member whose range contains the code point,
            or ``Script.Unknown`` when no range matches.
        """
        point = ord(char)

        # The ranges below are disjoint, so the order of the checks does not
        # affect the result; they are listed by ascending code point.

        # NULL .. LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
        if 0x0000 <= point <= 0x02AF:
            return Script.Latin
        # GREEK CAPITAL LETTER HETA ..
        # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
        if 0x0370 <= point <= 0x03FF:
            return Script.Greek
        # CYRILLIC CAPITAL LETTER IE WITH GRAVE ..
        # CYRILLIC SMALL LETTER EL WITH DESCENDER
        if 0x0400 <= point <= 0x052F:
            return Script.Cyrillic
        # ARMENIAN CAPITAL LETTER AYB .. ARMENIAN HYPHEN
        if 0x0531 <= point <= 0x058A:
            return Script.Armenian
        # CHEROKEE LETTER A .. CHEROKEE SMALL LETTER MV
        if 0x13A0 <= point <= 0x13FD:
            return Script.Cherokee
        # TAI LE LETTER KA .. U+197F
        if 0x1950 <= point <= 0x197F:
            return Script.TaiLe
        # COPTIC CAPITAL LETTER ALFA .. COPTIC MORPHOLOGICAL DIVIDER
        if 0x2C80 <= point <= 0x2CFF:
            return Script.Coptic
        # FULLWIDTH EXCLAMATION MARK .. FULLWIDTH RIGHT WHITE PARENTHESIS
        if 0xFF01 <= point <= 0xFF60:
            return Script.FullWidth

        return Script.Unknown

    @utils.hook("received.message.channel")
    def channel_message(self, event):
        """Compute and trace-log the mixed-unicode score of a message.

        One ``ScoreReason`` is recorded for every script change between
        consecutive classified characters, a further one when that change
        happens without a word separator directly before it, and one for each
        distinct script beyond the first. The score is the number of recorded
        reasons. Nothing is logged when the score is zero.

        :param event: mapping whose ``"message"`` entry is iterated character
            by character (assumed to be a ``str`` - confirm against the hook
            dispatcher).
        :returns: ``None``.
        """
        last_script = None
        last_was_separator = False
        reasons = []
        scripts = set()

        for char in event["message"]:
            if char in WORD_SEPERATORS:
                last_was_separator = True
                continue

            script = self._detect_script(char)
            # Characters of unknown script neither count as a script nor
            # update last_script, but they do clear the separator flag below.
            if script != Script.Unknown:
                scripts.add(script)
                if last_script is not None and script != last_script:
                    reasons.append(ScoreReason.ScriptChange)
                    if not last_was_separator:
                        reasons.append(ScoreReason.ScriptChangeInWord)

                last_script = script

            last_was_separator = False

        if len(scripts) > 1:
            reasons.extend(
                [ScoreReason.AdditionalScript] * (len(scripts) - 1))

        score = len(reasons)
        if score == 0:
            return

        # itertools.groupby only merges *adjacent* equal items. The reasons
        # are appended interleaved (ScriptChange, ScriptChangeInWord,
        # ScriptChange, ...), so they must be sorted first; otherwise the
        # same reason shows up several times in the summary with partial
        # counts instead of once with its total.
        ordered = sorted(reasons, key=lambda reason: reason.value)
        reasons_s = []
        for reason, group in itertools.groupby(ordered):
            reasons_s.append("%s: %s" % (reason, sum(1 for _ in group)))

        self.log.trace(
            "Message given a mixed-unicode score of %s (reasons: %s)",
            [score, ", ".join(reasons_s)])