# Extracted from a cgit web view; the page navigation links and the
# line-number gutter (1-52) were captured along with the code.
# path: root/modules/mixed_unicode.py
# blob: c2e3ba41559dd8655917438cbb67473914acaea4
import enum
from src import ModuleManager, utils

class Script(enum.Enum):
    """Writing systems that a single character can be classified as.

    The numeric values are plain identifiers; members are only ever
    compared with one another.
    """

    # Fallback for a character that matches no known code point range.
    Unknown = 0

    # The scripts the mixed-script scoring can tell apart.
    Latin = 1      # also covers ASCII digits and punctuation (range starts at 0)
    Cyrillic = 2
    Greek = 3
    Armenian = 4
    FullWidth = 5  # full-width forms, U+FF01..U+FF60
    Coptic = 6     # the dedicated Coptic range, U+2C80..U+2CFF
# Characters treated as word boundaries when scoring a message: a script
# change that directly follows one of these scores lower than a script
# change in the middle of a word.
# NOTE: the "SEPERATORS" spelling is kept on purpose; the Module class in
# this file looks the constant up under this exact name.
WORD_SEPERATORS = [",", " ", "\t", "."]

class Module(ModuleManager.BaseModule):
    """Rate channel messages by how often they switch between scripts.

    Every message delivered to the "received.message.channel" hook is
    given an integer "mixed-unicode" score, which is written to the trace
    log. Nothing else is done with the score here.
    """

    def _detect_script(self, char):
        """Classify one character by its Unicode code point.

        :param char: a single-character string (it is passed to ``ord``).
        :returns: the ``Script`` member whose range contains the code
            point, or ``Script.Unknown`` when no range matches.
        """
        point = ord(char)

        # Basic Latin .. IPA Extensions. The range starts at 0, so ASCII
        # digits, punctuation and control characters count as Latin too.
        if 0x0000 <= point <= 0x02AF:
            return Script.Latin
        # Greek and Coptic block.
        if 0x0370 <= point <= 0x03FF:
            return Script.Greek
        # Cyrillic and Cyrillic Supplement blocks.
        if 0x0400 <= point <= 0x052F:
            return Script.Cyrillic
        # ARMENIAN CAPITAL LETTER AYB .. ARMENIAN HYPHEN
        if 0x0531 <= point <= 0x058A:
            return Script.Armenian
        # COPTIC CAPITAL LETTER ALFA .. COPTIC MORPHOLOGICAL DIVIDER
        if 0x2C80 <= point <= 0x2CFF:
            return Script.Coptic
        # FULLWIDTH EXCLAMATION MARK .. FULLWIDTH RIGHT WHITE PARENTHESIS
        if 0xFF01 <= point <= 0xFF60:
            return Script.FullWidth
        return Script.Unknown

    def _score_message(self, message):
        """Return the mixed-script score of ``message``.

        The message is walked character by character. Each time a
        character's script differs from the script of the previous
        classified character, the score grows by 1 if a word separator
        came directly before it and by 2 otherwise (a switch inside a
        word). Characters in ``WORD_SEPERATORS`` and characters of
        unknown script never count as a switch themselves.

        :param message: iterable of single characters (normally a str).
        :returns: the score as a non-negative int; 0 for an empty message.
        """
        score = 0
        last_script = None
        last_was_separator = False

        for char in message:
            if char in WORD_SEPERATORS:
                last_was_separator = True
                continue

            script = self._detect_script(char)
            if script is not Script.Unknown:
                # Explicit None check: do not depend on enum truthiness.
                if last_script is not None and script is not last_script:
                    score += 1 if last_was_separator else 2
                last_script = script

            # Any non-separator character, including one of unknown
            # script, ends the "just after a separator" state.
            last_was_separator = False

        return score

    @utils.hook("received.message.channel")
    def channel_message(self, event):
        """Score the message carried by ``event`` and trace-log the result.

        :param event: mapping-like hook event; only ``event["message"]``
            is read (assumed to be the message text as a str - confirm
            against the hook's producer).
        """
        score = self._score_message(event["message"])
        # Format arguments are handed over as a list, exactly as before.
        self.log.trace("Message given a mixed-unicode score of %d", [score])