aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author jesopo 2018-11-20 11:38:30 +0000
committer jesopo 2018-11-20 11:38:30 +0000
commit b19ce0be2f274e9e43f9a12128cfd210e289bf74 (patch)
tree b95920c2eb4bf9b04401b2de66a405f6995f44d9
parent Better descript opened pull requests (github.py) (diff)
signature
Add first version of modules/mixed_unicode.py, designed to detect when we get a
message that mixes scripts (Latin, Cyrillic, Greek, etc.) that might be spam.
-rw-r--r-- modules/mixed_unicode.py 42
1 files changed, 42 insertions, 0 deletions
diff --git a/modules/mixed_unicode.py b/modules/mixed_unicode.py
new file mode 100644
index 00000000..d2ef796e
--- /dev/null
+++ b/modules/mixed_unicode.py
@@ -0,0 +1,42 @@
+import enum
+from src import ModuleManager, utils
+
+class Script(enum.Enum):
+    """Writing scripts that a single character can be classified as."""
+
+    # Used for any character that is not assigned to one of the scripts below.
+    Unknown = 0
+    Latin = 1
+    Cyrillic = 2
+    Greek = 3
+# Characters that count as a boundary between words when a message is scanned.
+# NOTE: the misspelling ("SEPERATORS") is kept on purpose; the name is
+# referenced elsewhere in this module.
+WORD_SEPERATORS = [
+    ",",
+    " ",
+    "\t",
+    ".",
+]
+
+class Module(ModuleManager.BaseModule):
+    """Score channel messages by how often they switch writing script.
+
+    A message that mixes scripts (for example Latin and Cyrillic letters)
+    might be spam. The score is currently only written to the trace log.
+    """
+
+    def _detect_script(self, char):
+        """Return the `Script` that the single character `char` belongs to.
+
+        Only alphabetic characters are assigned a script. Digits, punctuation,
+        symbols and control characters are shared between scripts, so they are
+        reported as `Script.Unknown`; otherwise a purely Cyrillic or Greek
+        message containing "!" or a number would be scored as mixed-script.
+
+        Raises:
+            TypeError: if `char` is a string that is not exactly one
+                character long (raised by `ord`).
+        """
+        point = ord(char)
+        if not char.isalpha():
+            return Script.Unknown
+
+        if 0 <= point <= 687:  # U+0000..U+02AF: Basic Latin .. IPA Extensions
+            return Script.Latin
+        elif 880 <= point <= 1023:  # U+0370..U+03FF: Greek and Coptic
+            return Script.Greek
+        elif 1024 <= point <= 1327:  # U+0400..U+052F: Cyrillic (+ Supplement)
+            return Script.Cyrillic
+        return Script.Unknown
+
+    def _score_message(self, message):
+        """Return the mixed-script score of the string `message`.
+
+        Every change from one known script to a different known script adds 1.
+        If no word separator was seen immediately before the changing
+        character, the change adds 1 more. An empty message scores 0.
+        """
+        score = 0
+        last_script = None
+        last_was_separator = False
+
+        for char in message:
+            if char in WORD_SEPERATORS:
+                last_was_separator = True
+                continue
+
+            script = self._detect_script(char)
+            if script is not Script.Unknown:
+                # Compare against None explicitly rather than relying on the
+                # truthiness of enum members.
+                if last_script is not None and script is not last_script:
+                    score += 1
+                    if not last_was_separator:
+                        score += 1
+
+                last_script = script
+
+            # Any non-separator character, known script or not, clears the
+            # separator flag.
+            last_was_separator = False
+
+        return score
+
+    @utils.hook("received.message.channel")
+    def channel_message(self, event):
+        """Compute and trace-log the mixed-script score of `event["message"]`."""
+        score = self._score_message(event["message"])
+        # The format arguments are passed as a list, exactly as the original
+        # call did - presumably the project logger's convention; confirm
+        # against the logging module.
+        self.log.trace("Message given a mixed-unicode score of %d", [score])