capture.py
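
"""Crawl a single domain and log every URL it serves.

Each fetched URL is recorded with its content type, status code, response size,
and a rough page height, both to a tab-separated .txt capture and to a
Bootstrap-styled .html report under ./captures/."""
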
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import time
import os
import random
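

# Drop any trailing slash so the same page is not queued twice under two spellings of its URL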
def normalize_url(url):
    return url.rstrip('/')
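

# Skip URLs that point at PHP endpoints or WordPress internals rather than content pages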
def should_ignore_url(url):
    return any(ignore_term in url for ignore_term in ['.php', '/wp-json/', '/wp-admin/'])
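

# Breadth-first crawl of one domain: fetch each queued URL, log it to both
# capture files, and queue any same-domain links, scripts, and stylesheets it references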
def crawl_website(domain, sleep_time_range):
    crawled_urls = set()
    to_crawl = [f'https://{normalize_url(domain)}']

    if not os.path.exists('./captures'):
        os.makedirs('./captures')

    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    txt_file = f'./captures/{domain}-capture-{timestamp}.txt'
    html_file = f'./captures/{domain}-capture-{timestamp}.html'
    print(f"Generating capture files: {txt_file}, {html_file}")

    with open(txt_file, 'w') as f:
        f.write("Type\tURL\tStatus_Code\tSize\tHeight\n")

    # Setting up HTML with Bootstrap styling
    with open(html_file, 'w') as f:
        f.write("""
<html>
<head>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
</head>
<body>
<div class="container mt-4">
<h1>All URLs</h1>
<table class='table table-bordered table-striped'>
<tr>
<th>Type</th><th>URL</th><th>Status Code</th><th>Size</th><th>Height</th>
</tr>
""")
    while to_crawl:
        url = to_crawl.pop(0)
        normalized_url = normalize_url(url)

        if should_ignore_url(normalized_url):
            continue

        if normalized_url not in crawled_urls:
            print(f"Crawling: {normalized_url}")
            try:
                # 30-second timeout so one stalled host cannot hang the whole crawl
                response = requests.get(normalized_url, timeout=30)
            except requests.exceptions.RequestException as e:
                print(f"An error occurred: {e}")
                continue
            crawled_urls.add(normalized_url)

            content_type = response.headers.get('Content-Type', '')
            status_code = response.status_code
            page_size = len(response.content)
            page_height = ''
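
            # HTML pages: record a rough height (line count of the <body> markup) and harvest links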
            if 'text/html' in content_type:
                soup = BeautifulSoup(response.text, 'html.parser')
                page_height = len(str(soup.find('body')).split('\n'))

                for tag_name, attr_name in [('a', 'href'), ('script', 'src'), ('link', 'href')]:
                    for tag in soup.find_all(tag_name, **{attr_name: True}):
                        new_url = urljoin(normalized_url, tag[attr_name])
                        new_url = normalize_url(new_url)
                        parsed_url = urlparse(new_url)
                        if parsed_url.netloc == domain and not parsed_url.fragment:
                            if not should_ignore_url(new_url):
                                to_crawl.append(new_url)

                content_type_label = 'HTML'
            elif 'text/css' in content_type or 'application/javascript' in content_type:
                content_type_label = 'Asset'
            elif 'image/' in content_type:
                content_type_label = 'Image'
            else:
                content_type_label = 'Other'

            with open(txt_file, 'a') as f:
                f.write(f"{content_type_label}\t{normalized_url}\t{status_code}\t{page_size}\t{page_height}\n")

            # Using Bootstrap styled table rows
            with open(html_file, 'a') as f:
                f.write(f"<tr><td>{content_type_label}</td><td><a href='{normalized_url}' target='_blank'>{normalized_url}</a></td><td>{status_code}</td><td>{page_size}</td><td>{page_height}</td></tr>")
            sleep_time = random.uniform(*sleep_time_range)
            time.sleep(sleep_time)

    with open(html_file, 'a') as f:
        f.write("</table></div></body></html>\n")

    print("Crawl complete. Capture files generated.")
if __name__ == "__main__":
    domain = input("Enter the domain of the website to crawl (e.g., example.com): ")
    print("Select the type of crawl:")
    print("1. Aggressive (Sleep time between 100ms-500ms)")
    print("2. Careful (Sleep time between 500ms-3000ms)")
    crawl_type_selection = input("Enter the number corresponding to your choice: ").strip()
    sleep_time_range = (0.1, 0.5) if crawl_type_selection == '1' else (0.5, 3)
    crawl_website(domain, sleep_time_range)