AI integration with Maigret, see MAIGRET.md #25

Open · wants to merge 2 commits into base: main
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.maigret.json

venv
__pycache__
14 changes: 14 additions & 0 deletions MAIGRET.md
@@ -0,0 +1,14 @@
## Maigret Exporter

Run Marple with the following parameters to export new sites to Maigret:

```
python3 marple.py text --plugins maigret extract_username maigret_export random_username
```
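
When a site is missing from the Maigret database, the exporter prints candidate detection keywords next to the link. The output below is illustrative (hypothetical site, username, and keywords); the line formats come from the plugin messages in `marple.py`:

```
URL: [0] [google] https://example-forum.com/user/shadow_fox42 [ ] Maigret
Title: shadow_fox42's profile
[extract_username] Username guessed by AI: shadow_fox42
[maigret_exporter] Presence keywords for Maigret: shadow_fox42's profile, Reputation
[maigret_exporter] Absence keywords for Maigret: Page not found, User does not exist
```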

### TODO

- [ ] Add a direct integration with Maigret (`--submit`)
- [ ] Implement the GitHub API call to create an issue with the parameters of a new site (see the sketch below)
- [ ] Use AI to determine whether a link looks like an account page
- [ ] Add AI-based tag generation for websites
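
For the GitHub issue item above, a minimal sketch of what the submission could look like, assuming a `GITHUB_TOKEN` environment variable and the standard REST endpoint for creating issues (the target repository, title, and body layout are placeholders, not part of this PR):

```
import os
import requests

def submit_new_site_issue(site_url, presence_strings, absence_strings):
    # POST /repos/{owner}/{repo}/issues is the GitHub REST API call for creating an issue
    resp = requests.post(
        'https://api.github.com/repos/soxoj/maigret/issues',
        headers={
            'Authorization': f'Bearer {os.environ["GITHUB_TOKEN"]}',
            'Accept': 'application/vnd.github+json',
        },
        json={
            'title': f'New site: {site_url}',
            'body': (
                f"Presence keywords: {', '.join(presence_strings)}\n"
                f"Absence keywords: {', '.join(absence_strings)}"
            ),
        },
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()['html_url']
```
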
174 changes: 172 additions & 2 deletions marple.py
@@ -11,6 +11,9 @@

import aiohttp
import requests
import random
import string
import difflib
import tqdm
from aiohttp_socks import ProxyConnector
from bs4 import BeautifulSoup as bs
@@ -35,6 +38,118 @@
'/search?q=',
]

def ai_generate_username():
url = "http://localhost:1234/v1/chat/completions"
headers = {
"Content-Type": "application/json"
}
data = {
"model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
"messages": [
{"role": "system", "content": "Always answer with a message contains only an answer, without any comments and explanations"},
{"role": "user", "content": "Give a random internet username"}
],
"temperature": 0.7,
"max_tokens": -1,
"stream": False
}

    try:
        response = requests.post(url, headers=headers, json=data, timeout=60)
        username = response.json()["choices"][0]["message"]["content"]
        username = username.strip('"')
        return username
    except (requests.RequestException, KeyError, IndexError) as e:
        raise Exception("The LLM endpoint is not available. Please edit the LLM API endpoint settings in the source code") from e
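
# Note (editor's sketch, not part of this PR): the URL above is assumed to be
# LM Studio's OpenAI-compatible /v1/chat/completions endpoint, whose response
# is expected to look roughly like
#   {"choices": [{"message": {"role": "assistant", "content": "<username>"}}]}
# which is why the username is read from choices[0].message.content.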

def generate_random_username():
return ''.join(random.choices(string.ascii_lowercase, k=10))

def maigret_exporter(link):
username = link.name
random_username = generate_random_username()
    try:
        # timeouts so a hanging site does not stall the whole export
        first_html_response = requests.get(link.url, timeout=30).text
        url_of_non_existing_account = link.url.lower().replace(username.lower(), random_username)
        second_html_response = requests.get(url_of_non_existing_account, timeout=30).text
except Exception as e:
return None, None, str(e)

SEPARATORS = "\"'\n"
TOP_FEATURES = 5

tokens_a = set(re.split(f'[{SEPARATORS}]', first_html_response))
tokens_b = set(re.split(f'[{SEPARATORS}]', second_html_response))

a_minus_b = tokens_a.difference(tokens_b)
b_minus_a = tokens_b.difference(tokens_a)

a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b))
b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a))

# Filter out strings containing usernames
a_minus_b = [s for s in a_minus_b if username not in s]
b_minus_a = [s for s in b_minus_a if random_username not in s]

if len(a_minus_b) == len(b_minus_a) == 0:
return None, None, "HTML responses are the same"

presence_strings = [
"username",
"not found",
"пользователь",
"profile",
"lastname",
"firstname",
"biography",
"birthday",
"репутация",
"информация",
"e-mail"
]

def get_match_ratio(base_strs: list):
def get_match_inner(s: str):
return round(
max(
[
difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
for s2 in base_strs
]
),
2,
)
return get_match_inner

match_fun = get_match_ratio(presence_strings)

presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[:TOP_FEATURES]
absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[:TOP_FEATURES]

return presence_list, absence_list, "Found"
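
# Illustrative walk-through of the diffing above (hypothetical pages): if the
# real-account page contains the token "alice's profile" and the page for the
# random username contains "User not found", the set differences leave the
# former only in a_minus_b (a presence keyword) and the latter only in
# b_minus_a (an absence keyword); get_match_ratio then keeps the TOP_FEATURES
# tokens with the best difflib similarity to the known phrases in
# presence_strings.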

def extract_username_from_url(site_url):
url = "http://localhost:1234/v1/chat/completions"
headers = {
"Content-Type": "application/json"
}
data = {
"model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
"messages": [
# {"role": "system", "content": "Always answer with a message contains only an answer (one word), without any comments and explanations"},
{"role": "user", "content": f"Extract the username from the URL: {site_url}. The username is the part of the URL that comes immediately after the last '/' separator and may include '.', '-', and '_' as valid characters. The username should exclude any prefixes like 'http', 'https', 'www', or any trailing query parameters ('?') or fragments ('#'). Symbols '.', '-', and '_' must be treated as integral parts of the username and not removed or modified. Answer with a username only, which can be a combination of segments separated by '.', '-', and '_'."},
],
"n_ctx": 2048,
"temperature": 0.8,
"max_tokens": -1,
"stream": False
}

    try:
        response = requests.post(url, headers=headers, json=data, timeout=60)
        return response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError) as e:
        raise Exception("The LLM endpoint is not available. Please edit the LLM API endpoint settings in the source code") from e
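
# A deterministic fallback for the same extraction (an editor's sketch, not
# part of this PR) would be roughly:
#
#     from urllib.parse import urlparse
#
#     def extract_username_fallback(site_url):
#         # last path segment; urlparse already drops the '?' query and '#' fragment
#         return urlparse(site_url).path.rstrip('/').rsplit('/', 1)[-1]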

class Link:
url: str
@@ -547,7 +662,7 @@ def main():
dest='plugins',
nargs='+',
default='',
choices={'maigret', 'socid_extractor', 'metadata'},
choices={'maigret', 'socid_extractor', 'metadata', 'random_username', 'extract_username', 'maigret_export'},
help='Additional plugins to analyze links',
)
parser.add_argument(
@@ -585,6 +700,11 @@
)
args = parser.parse_args()

if args.plugins and 'random_username' in args.plugins:
new_username = ai_generate_username()
print(colored(f'[random_username] AI-generated username "{new_username}" will be used for search instead of "{args.name}"', 'green'))
args.name = new_username

username = args.name
if " " in username:
print(colored('Warning, search by firstname+lastname '
@@ -640,6 +760,10 @@
def is_likely_profile(r):
return r.is_it_likely_username_profile() and r.junk_score <= args.threshold and not r.filtered

    junk_scores = [r.junk_score for r in result.unique_links]
    # median (middle element of the sorted scores) and mean, used for the hint below
    median_junk_score = sorted(junk_scores)[len(junk_scores) // 2] if junk_scores else 0
    average_junk_score = sum(junk_scores) / len(junk_scores) if junk_scores else 0

# reliable links section
for r in result.unique_links:
if is_likely_profile(r):
@@ -651,12 +775,28 @@ def is_likely_profile(r):
message = colored(f'[{r.junk_score}]', 'magenta') + ' ' + \
colored(f'[{r.source}]', 'green') + ' ' + message

            maigret_found = False
            if 'maigret' in args.plugins and maigret.db:
                main_url = r.url.replace(args.name, '')

                if os.path.exists('.maigret.json'):
                    with open('.maigret.json') as f:
                        urls = json.load(f)
                else:
                    urls = []

                if main_url in urls:
                    message += colored(' [v] Local findings', 'green')

                if maigret.db.extract_ids_from_url(r.url):
                    message += colored(' [v] Maigret', 'green')
                    maigret_found = True
                else:
                    message += colored(' [ ] Maigret', 'yellow')

                # remember the base URL so repeated runs can report it as a local finding
                if main_url not in urls:
                    urls.append(main_url)
                with open('.maigret.json', 'w') as f:
                    json.dump(urls, f, indent=4)
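                # the cache is a flat JSON list of base URLs, e.g. (hypothetical):
                #   ["https://example.com/user/", "https://forum.example.org/"]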

if 'socid_extractor' in args.plugins:
try:
req = requests.get(r.url)
Expand All @@ -666,7 +806,32 @@ def is_likely_profile(r):
except Exception as e:
print(colored(e, 'red'))

print(f'{message}\n{r.title}\n')
message += f'\n{colored("Title:", "cyan")} {r.title}'

            if 'extract_username' in args.plugins:
                guessed_username = extract_username_from_url(r.url)
                # workaround for the case when the AI response contains a comment
                guessed_username = guessed_username.split()[-1]

                comment = ""
                if guessed_username.lower() not in r.url.lower():
                    comment = colored(" Invalid", "red")
                message += colored("\n[extract_username] Username guessed by AI: ", 'cyan') + guessed_username + comment

            if 'maigret_export' in args.plugins:
                if maigret_found:
                    message += colored("\n[maigret_exporter] The site was already found in Maigret, skipping...", 'yellow')
                else:
                    presence_strings, absence_strings, status = maigret_exporter(r)
                    if status != "Found":
                        message += colored(f"\n[maigret_exporter] No keywords found: {status}", 'yellow')
                    else:
                        message += colored("\n[maigret_exporter] Presence keywords for Maigret: ", 'yellow') + ', '.join(presence_strings)
                        message += colored("\n[maigret_exporter] Absence keywords for Maigret: ", 'yellow') + ', '.join(absence_strings)

print(f'{colored("URL:", "cyan")} {message}\n')

pdf_count = 0

@@ -724,6 +889,11 @@ def is_pdf_file(url):

print(f"{colored(status_msg, 'cyan')}\n{colored(error_msg, 'yellow')}")

    if displayed_count == 0 and uniq_count > 20:
        print(colored('\nNo reliable links passed the filter, although there are more than 20 unique links.', 'red'))
        print(colored(f'Try decreasing the threshold with the -t option (currently {args.threshold}).', 'red'))
        print(colored(f'Junk scores: median {median_junk_score:.1f} / average {average_junk_score:.1f}\n', 'red'))

if args.csv:
with open(args.csv, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
107 changes: 93 additions & 14 deletions requirements.txt
@@ -1,14 +1,93 @@
aiohttp>=3.8.0
termcolor>=2.0.0
beautifulsoup4>=4.9.0
requests>=2.25.0
yandex-search>=0.3.2
PyPDF2>=2.0.0
socid-extractor>=0.0.1
aiohttp-socks>=0.7.0
tqdm>=4.65.0
google-search-results>=2.4.0
mock>=4.0.0
arabic-reshaper>=2.1.4
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/master.zip
search-engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip
aiodns==3.2.0
aiohappyeyeballs==2.4.3
aiohttp==3.11.7
aiohttp-socks==0.7.1
aiosignal==1.3.1
arabic-reshaper==3.0.0
asn1crypto==1.5.1
asttokens==2.4.1
async-timeout==4.0.3
attrs==22.2.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
click==8.1.7
cloudscraper==1.2.71
colorama==0.4.6
cryptography==43.0.3
cssselect2==0.7.0
decorator==5.1.1
executing==2.1.0
frozenlist==1.5.0
future==1.0.0
future-annotations==1.0.0
google_search_results==2.4.2
html5lib==1.1
idna==3.10
ipython==8.29.0
jedi==0.19.2
Jinja2==3.1.4
jsonpickle==4.0.0
lxml==5.3.0
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/master.zip
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
mock==4.0.3
multidict==6.1.0
networkx==2.8.8
oscrypto==1.3.0
parso==0.8.4
pexpect==4.9.0
pillow==11.0.0
prompt_toolkit==3.0.48
propcache==0.2.0
ptyprocess==0.7.0
pure_eval==0.2.3
pycares==4.4.0
pycountry==23.12.11
pycparser==2.22
Pygments==2.18.0
pyHanko==0.25.3
pyhanko-certvalidator==0.26.5
pyparsing==3.2.0
pypdf==5.1.0
PyPDF2==3.0.1
PySocks==1.7.1
python-bidi==0.4.2
python-dateutil==2.9.0.post0
python-socks==2.5.3
pyvis==0.2.1
PyYAML==6.0.2
qrcode==8.0
reportlab==4.2.5
requests==2.32.3
requests-futures==1.0.2
requests-toolbelt==1.0.0
search_engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip#sha256=329c8a1aff702ced584e5a9f75663d6759104628df8634a740435043cb199ec0
setuptools==75.6.0
six==1.16.0
socid-extractor==0.0.26
soupsieve==2.6
stack-data==0.6.3
stem==1.8.2
svglib==1.5.1
termcolor==2.5.0
tinycss2==1.4.0
tokenize_rt==6.1.0
torrequest==0.1.0
tqdm==4.67.0
traitlets==5.14.3
typing_extensions==4.12.2
tzlocal==5.2
uritools==4.0.3
urllib3==2.2.3
wcwidth==0.2.13
webencodings==0.5.1
wheel==0.45.1
xhtml2pdf==0.2.16
XMind==1.2.0
yandex-search==0.3.2
yarl==1.18.0