AI integration with Maigret, see MAIGRET.md #25

Open · wants to merge 2 commits into base: main
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.maigret.json

venv
__pycache__
14 changes: 14 additions & 0 deletions MAIGRET.md
@@ -0,0 +1,14 @@
## Maigret Exporter

Run Marple with the following parameters to export new sites to Maigret:

```
python3 marple.py text --plugins maigret extract_username maigret_export random_username
```
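
When a site is missing from the Maigret database, the exporter prints candidate detection keywords next to the link. The output below is illustrative (hypothetical site, username, and keywords); the line formats come from the plugin messages in `marple.py`:

```
URL: [0] [google] https://example-forum.com/user/shadow_fox42 [ ] Maigret
Title: shadow_fox42's profile
[extract_username] Username guessed by AI: shadow_fox42
[maigret_exporter] Presence keywords for Maigret: shadow_fox42's profile, Reputation
[maigret_exporter] Absence keywords for Maigret: Page not found, User does not exist
```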

### TODO

- [ ] Add a direct integration with Maigret (`--submit`)
- [ ] Implement the GitHub API call to create an issue with the parameters of a new site (see the sketch below)
- [ ] Use AI to determine whether a link looks like an account page
- [ ] Add AI-based tag generation for websites
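
For the GitHub issue item above, a minimal sketch of what the submission could look like, assuming a `GITHUB_TOKEN` environment variable and the standard REST endpoint for creating issues (the target repository, title, and body layout are placeholders, not part of this PR):

```
import os
import requests

def submit_new_site_issue(site_url, presence_strings, absence_strings):
    # POST /repos/{owner}/{repo}/issues is the GitHub REST API call for creating an issue
    resp = requests.post(
        'https://api.github.com/repos/soxoj/maigret/issues',
        headers={
            'Authorization': f'Bearer {os.environ["GITHUB_TOKEN"]}',
            'Accept': 'application/vnd.github+json',
        },
        json={
            'title': f'New site: {site_url}',
            'body': (
                f"Presence keywords: {', '.join(presence_strings)}\n"
                f"Absence keywords: {', '.join(absence_strings)}"
            ),
        },
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()['html_url']
```
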
174 changes: 172 additions & 2 deletions marple.py
@@ -11,6 +11,9 @@

import aiohttp
import requests
import random
import string
import difflib
import tqdm
from aiohttp_socks import ProxyConnector
from bs4 import BeautifulSoup as bs
@@ -35,6 +38,118 @@
'/search?q=',
]

def ai_generate_username():
url = "http://localhost:1234/v1/chat/completions"
headers = {
"Content-Type": "application/json"
}
data = {
"model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
"messages": [
{"role": "system", "content": "Always answer with a message contains only an answer, without any comments and explanations"},
{"role": "user", "content": "Give a random internet username"}
],
"temperature": 0.7,
"max_tokens": -1,
"stream": False
}

    try:
        response = requests.post(url, headers=headers, json=data, timeout=60)
        username = response.json()["choices"][0]["message"]["content"]
        username = username.strip('"')
        return username
    except (requests.RequestException, KeyError, IndexError) as e:
        raise Exception("The LLM endpoint is not available. Please edit the LLM API endpoint settings in the source code") from e
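
# Note (editor's sketch, not part of this PR): the URL above is assumed to be
# LM Studio's OpenAI-compatible /v1/chat/completions endpoint, whose response
# is expected to look roughly like
#   {"choices": [{"message": {"role": "assistant", "content": "<username>"}}]}
# which is why the username is read from choices[0].message.content.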

def generate_random_username():
return ''.join(random.choices(string.ascii_lowercase, k=10))

def maigret_exporter(link):
username = link.name
random_username = generate_random_username()
    try:
        # timeouts so a hanging site does not stall the whole export
        first_html_response = requests.get(link.url, timeout=30).text
        url_of_non_existing_account = link.url.lower().replace(username.lower(), random_username)
        second_html_response = requests.get(url_of_non_existing_account, timeout=30).text
except Exception as e:
return None, None, str(e)

SEPARATORS = "\"'\n"
TOP_FEATURES = 5

tokens_a = set(re.split(f'[{SEPARATORS}]', first_html_response))
tokens_b = set(re.split(f'[{SEPARATORS}]', second_html_response))

a_minus_b = tokens_a.difference(tokens_b)
b_minus_a = tokens_b.difference(tokens_a)

a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b))
b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a))

# Filter out strings containing usernames
a_minus_b = [s for s in a_minus_b if username not in s]
b_minus_a = [s for s in b_minus_a if random_username not in s]

if len(a_minus_b) == len(b_minus_a) == 0:
return None, None, "HTML responses are the same"

presence_strings = [
"username",
"not found",
"пользователь",
"profile",
"lastname",
"firstname",
"biography",
"birthday",
"репутация",
"информация",
"e-mail"
]

def get_match_ratio(base_strs: list):
def get_match_inner(s: str):
return round(
max(
[
difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
for s2 in base_strs
]
),
2,
)
return get_match_inner

match_fun = get_match_ratio(presence_strings)

presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[:TOP_FEATURES]
absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[:TOP_FEATURES]

return presence_list, absence_list, "Found"
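
# Illustrative walk-through of the diffing above (hypothetical pages): if the
# real-account page contains the token "alice's profile" and the page for the
# random username contains "User not found", the set differences leave the
# former only in a_minus_b (a presence keyword) and the latter only in
# b_minus_a (an absence keyword); get_match_ratio then keeps the TOP_FEATURES
# tokens with the best difflib similarity to the known phrases in
# presence_strings.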

def extract_username_from_url(site_url):
url = "http://localhost:1234/v1/chat/completions"
headers = {
"Content-Type": "application/json"
}
data = {
"model": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
"messages": [
# {"role": "system", "content": "Always answer with a message contains only an answer (one word), without any comments and explanations"},
{"role": "user", "content": f"Extract the username from the URL: {site_url}. The username is the part of the URL that comes immediately after the last '/' separator and may include '.', '-', and '_' as valid characters. The username should exclude any prefixes like 'http', 'https', 'www', or any trailing query parameters ('?') or fragments ('#'). Symbols '.', '-', and '_' must be treated as integral parts of the username and not removed or modified. Answer with a username only, which can be a combination of segments separated by '.', '-', and '_'."},
],
"n_ctx": 2048,
"temperature": 0.8,
"max_tokens": -1,
"stream": False
}

    try:
        response = requests.post(url, headers=headers, json=data, timeout=60)
        return response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError) as e:
        raise Exception("The LLM endpoint is not available. Please edit the LLM API endpoint settings in the source code") from e
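
# A deterministic fallback for the same extraction (an editor's sketch, not
# part of this PR) would be roughly:
#
#     from urllib.parse import urlparse
#
#     def extract_username_fallback(site_url):
#         # last path segment; urlparse already drops the '?' query and '#' fragment
#         return urlparse(site_url).path.rstrip('/').rsplit('/', 1)[-1]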

class Link:
url: str
@@ -547,7 +662,7 @@ def main():
dest='plugins',
nargs='+',
default='',
choices={'maigret', 'socid_extractor', 'metadata'},
choices={'maigret', 'socid_extractor', 'metadata', 'random_username', 'extract_username', 'maigret_export'},
help='Additional plugins to analyze links',
)
parser.add_argument(
@@ -585,6 +700,11 @@
)
args = parser.parse_args()

if args.plugins and 'random_username' in args.plugins:
new_username = ai_generate_username()
print(colored(f'[random_username] AI-generated username "{new_username}" will be used for search instead of "{args.name}"', 'green'))
args.name = new_username

username = args.name
if " " in username:
print(colored('Warning, search by firstname+lastname '
@@ -640,6 +760,10 @@
def is_likely_profile(r):
return r.is_it_likely_username_profile() and r.junk_score <= args.threshold and not r.filtered

    junk_scores = [r.junk_score for r in result.unique_links]
    # median (middle element of the sorted scores) and mean, used for the hint below
    median_junk_score = sorted(junk_scores)[len(junk_scores) // 2] if junk_scores else 0
    average_junk_score = sum(junk_scores) / len(junk_scores) if junk_scores else 0

# reliable links section
for r in result.unique_links:
if is_likely_profile(r):
@@ -651,12 +775,28 @@ def is_likely_profile(r):
message = colored(f'[{r.junk_score}]', 'magenta') + ' ' + \
colored(f'[{r.source}]', 'green') + ' ' + message

            maigret_found = False
            if 'maigret' in args.plugins and maigret.db:
                main_url = r.url.replace(args.name, '')

                if os.path.exists('.maigret.json'):
                    with open('.maigret.json') as f:
                        urls = json.load(f)
                else:
                    urls = []

                if main_url in urls:
                    message += colored(' [v] Local findings', 'green')

                if maigret.db.extract_ids_from_url(r.url):
                    message += colored(' [v] Maigret', 'green')
                    maigret_found = True
                else:
                    message += colored(' [ ] Maigret', 'yellow')

                # remember the base URL so repeated runs can report it as a local finding
                if main_url not in urls:
                    urls.append(main_url)
                with open('.maigret.json', 'w') as f:
                    json.dump(urls, f, indent=4)
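                # the cache is a flat JSON list of base URLs, e.g. (hypothetical):
                #   ["https://example.com/user/", "https://forum.example.org/"]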

if 'socid_extractor' in args.plugins:
try:
req = requests.get(r.url)
Expand All @@ -666,7 +806,32 @@ def is_likely_profile(r):
except Exception as e:
print(colored(e, 'red'))

print(f'{message}\n{r.title}\n')
message += f'\n{colored("Title:", "cyan")} {r.title}'

            if 'extract_username' in args.plugins:
                guessed_username = extract_username_from_url(r.url)
                # workaround for the case when the AI response contains a comment
                guessed_username = guessed_username.split()[-1]

                comment = ""
                if guessed_username.lower() not in r.url.lower():
                    comment = colored(" Invalid", "red")
                message += colored("\n[extract_username] Username guessed by AI: ", 'cyan') + guessed_username + comment

            if 'maigret_export' in args.plugins:
                if maigret_found:
                    message += colored("\n[maigret_exporter] The site was already found in Maigret, skipping...", 'yellow')
                else:
                    presence_strings, absence_strings, status = maigret_exporter(r)
                    if status != "Found":
                        message += colored(f"\n[maigret_exporter] No keywords found: {status}", 'yellow')
                    else:
                        message += colored("\n[maigret_exporter] Presence keywords for Maigret: ", 'yellow') + ', '.join(presence_strings)
                        message += colored("\n[maigret_exporter] Absence keywords for Maigret: ", 'yellow') + ', '.join(absence_strings)

print(f'{colored("URL:", "cyan")} {message}\n')

pdf_count = 0

@@ -724,6 +889,11 @@ def is_pdf_file(url):

print(f"{colored(status_msg, 'cyan')}\n{colored(error_msg, 'yellow')}")

    if displayed_count == 0 and uniq_count > 20:
        print(colored('\nNo reliable links passed the filter, although there are more than 20 unique links.', 'red'))
        print(colored(f'Try decreasing the threshold with the -t option (currently {args.threshold}).', 'red'))
        print(colored(f'Junk scores: median {median_junk_score:.1f} / average {average_junk_score:.1f}\n', 'red'))

if args.csv:
with open(args.csv, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
107 changes: 93 additions & 14 deletions requirements.txt
@@ -1,14 +1,93 @@
aiohttp>=3.8.0
termcolor>=2.0.0
beautifulsoup4>=4.9.0
requests>=2.25.0
yandex-search>=0.3.2
PyPDF2>=2.0.0
socid-extractor>=0.0.1
aiohttp-socks>=0.7.0
tqdm>=4.65.0
google-search-results>=2.4.0
mock>=4.0.0
arabic-reshaper>=2.1.4
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/master.zip
search-engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip
aiodns==3.2.0
aiohappyeyeballs==2.4.3
aiohttp==3.11.7
aiohttp-socks==0.7.1
aiosignal==1.3.1
arabic-reshaper==3.0.0
asn1crypto==1.5.1
asttokens==2.4.1
async-timeout==4.0.3
attrs==22.2.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
click==8.1.7
cloudscraper==1.2.71
colorama==0.4.6
cryptography==43.0.3
cssselect2==0.7.0
decorator==5.1.1
executing==2.1.0
frozenlist==1.5.0
future==1.0.0
future-annotations==1.0.0
google_search_results==2.4.2
html5lib==1.1
idna==3.10
ipython==8.29.0
jedi==0.19.2
Jinja2==3.1.4
jsonpickle==4.0.0
lxml==5.3.0
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/master.zip
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
mock==4.0.3
multidict==6.1.0
networkx==2.8.8
oscrypto==1.3.0
parso==0.8.4
pexpect==4.9.0
pillow==11.0.0
prompt_toolkit==3.0.48
propcache==0.2.0
ptyprocess==0.7.0
pure_eval==0.2.3
pycares==4.4.0
pycountry==23.12.11
pycparser==2.22
Pygments==2.18.0
pyHanko==0.25.3
pyhanko-certvalidator==0.26.5
pyparsing==3.2.0
pypdf==5.1.0
PyPDF2==3.0.1
PySocks==1.7.1
python-bidi==0.4.2
python-dateutil==2.9.0.post0
python-socks==2.5.3
pyvis==0.2.1
PyYAML==6.0.2
qrcode==8.0
reportlab==4.2.5
requests==2.32.3
requests-futures==1.0.2
requests-toolbelt==1.0.0
search_engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip#sha256=329c8a1aff702ced584e5a9f75663d6759104628df8634a740435043cb199ec0
setuptools==75.6.0
six==1.16.0
socid-extractor==0.0.26
soupsieve==2.6
stack-data==0.6.3
stem==1.8.2
svglib==1.5.1
termcolor==2.5.0
tinycss2==1.4.0
tokenize_rt==6.1.0
torrequest==0.1.0
tqdm==4.67.0
traitlets==5.14.3
typing_extensions==4.12.2
tzlocal==5.2
uritools==4.0.3
urllib3==2.2.3
wcwidth==0.2.13
webencodings==0.5.1
wheel==0.45.1
xhtml2pdf==0.2.16
XMind==1.2.0
yandex-search==0.3.2
yarl==1.18.0