diff options
| author | 2019-10-04 11:20:35 +0100 | |
|---|---|---|
| committer | 2019-10-04 11:20:35 +0100 | |
| commit | 8e4c0f4963841b6dd31d11ba46335de231e61467 (patch) | |
| tree | 0d0f8aa28d0d5194c5889daefdba405ad3450277 | |
| parent | don't auto-title when a URL contains most of its <title> (diff) | |
| signature | ||
ignore one-char "words" in <title> if they're not a "letter"
| -rw-r--r-- | modules/title.py | 9 |
1 files changed, 7 insertions, 2 deletions
```diff
diff --git a/modules/title.py b/modules/title.py
index d23696e6..b3692de3 100644
--- a/modules/title.py
+++ b/modules/title.py
@@ -21,14 +21,19 @@ class Module(ModuleManager.BaseModule):
 
     def _different(self, url, title):
         url = url.lower()
-        title_words = [word.lower() for word in title.split()]
+        title_words = []
+        for title_word in title.split():
+            if len(title_word) > 1 or title_word.isalpha():
+                title_words.append(title_word.lower())
+
         present = 0
         for title_word in title_words:
             if title_word in url:
                 present += 1
 
+        similarity = present/len(title_words)
         # if at least 80% of words are in the URL, too similar
-        if (present/len(title_words)) >= 0.8:
+        if similarity >= 0.8:
             return False
         return True
 
```
