This repository has been archived by the owner on Dec 17, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 137
/
third_parties.js
208 lines (166 loc) · 5.89 KB
/
third_parties.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
'use strict';
// Load in known third party service names.
const fs = require('fs');
const path = require('path');
const knownPath = path.join(__dirname, '..', 'utils', 'known_services.json');
const known_services = JSON.parse(fs.readFileSync(knownPath, 'utf8'));
// Used to parse third party hostnames.
const URL = require('url');
// Debug output is switched on whenever the TEST_LOCAL environment
// variable is set to a non-empty value.
// TODO: Allow --debug to turn on debug output from CLI/Python-land.
// TODO: Move logging functions into base.js where possible.
var debug = Boolean(process.env.TEST_LOCAL);

// Default overall timeout, in seconds (multiplied by 1000 where a
// millisecond value is needed).
// TODO: make timeout calculation way more sophisticated. :)
// TODO: Move timeout management into base.js where possible.
var default_timeout = 20;
// JS entry point for third party scan.
module.exports = {
scan: async (domain, environment, options, browser, page) => {
const url = environment.url;
var data = {
url: url,
external_domains: [],
external_urls: [],
internal_domains: [],
internal_urls: [],
nearby_urls: [],
nearby_domains: [],
known_services: [],
unknown_services: [],
page_urls: [],
page_domains: []
};
// Trap each outgoing HTTP request to examine the URL.
page.on('request', (request) => {
processUrl(request.url(), url, data);
});
// Override puppeteer default of 30, especially since that
// causes Lambda execution itself to timeout and halt.
page.setDefaultNavigationTimeout(default_timeout * 1000);
try {
await page.goto(url);
} catch (exc) {
// if it's a timeout, that's okay, send back what we got.
if (exc.message.includes("Navigation Timeout Exceeded"))
return data;
// otherwise, re-throw and handle higher up.
else throw exc;
}
// find all the URLs/domains on the page
const html = await page.content();
data.page_urls = pageurls(html);
const allpagedomains = data.page_urls.map(getDomainFromURL);
data.page_domains = [...new Set(allpagedomains)];
// TODO: make smarter use of timeouts and events to decide 'done'
return data;
}
};
/**
 * Extract the distinct http(s)/ftp/file URLs that appear anywhere in a
 * string of HTML, in order of first appearance.
 *
 * @param {string} html - Markup (or any text) to search.
 * @returns {string[]} De-duplicated matches; empty when nothing matches.
 */
function pageurls(html) {
  const urlPattern = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
  // match() yields null when nothing is found; Set treats null as empty.
  const found = html.match(urlPattern);
  const unique = new Set(found);
  return Array.from(unique);
}
/**
 * Return the hostname component of a URL string, as reported by the
 * `url` module's parse().
 *
 * @param {string} href - URL to inspect.
 * @returns {?string} The parsed hostname, or null when parse() finds none.
 */
function getDomainFromURL(href) {
  const { hostname } = URL.parse(href);
  return hostname;
}
/**
 * Classify one requested URL relative to the page being scanned and record
 * it in `data`.
 *
 * The URL is filed under exactly one of internal / nearby / external, and is
 * additionally checked against the known third party services list.
 *
 * @param {string} href - URL of the outgoing request.
 * @param {string} sourceHref - URL of the page being scanned.
 * @param {object} data - Result object; its `*_urls`, `*_domains`,
 *   `known_services` and `unknown_services` arrays are appended to in place.
 * @returns {undefined}
 */
var processUrl = (href, sourceHref, data) => {
  if (debug) console.log("URI: " + href);

  // Ignore blob: and data: URIs, these do not generate an external request.
  // Catch them before running URL.parse(), since they are not URLs and the
  // URL.parse() function does not parse them correctly.
  const lowered = href.toLowerCase();
  if (["data:", "blob:"].some((protocol) => lowered.startsWith(protocol))) return;

  // Ignore the original request to the page itself.
  if (href === sourceHref) return;

  const url = URL.parse(href);
  const source = URL.parse(sourceHref);

  // Some requests (e.g. about:blank, file: or javascript: URIs) parse without
  // a hostname. There is no host to classify, and continuing would throw
  // inside the request event handler, so skip them.
  if (!url.hostname) return;

  // Append a value to a list only if it is not already present.
  const addUnique = (list, value) => {
    if (!list.includes(value)) list.push(value);
  };

  // Isolate the hostname with or without a www prefix,
  // and treat them effectively as the same hostname.
  let www_host, root_host;
  if (url.hostname.startsWith("www.")) {
    www_host = url.hostname;
    root_host = www_host.replace(/^www\./, "");
  } else {
    www_host = "www." + url.hostname;
    root_host = url.hostname;
  }

  const base_host = baseDomainFor(root_host);
  // A source URL without a hostname has no base domain to compare against;
  // null never equals a real base domain, so such requests count as external.
  const source_base = source.hostname ? baseDomainFor(source.hostname) : null;

  /***
   * There are 4 cases:
   * - internal: same hostname (or with a www prefix) as the source URL.
   * - nearby: same base domain, different (non-www) subdomain as source URL.
   * - affiliated: known to be affiliated in some way. (TBD)
   * - external: different base domain from source URL.
   ***/
  let hostType;
  if (www_host === source.hostname || root_host === source.hostname) {
    // Case 1: internal
    hostType = "internal";
  } else if (base_host === source_base) {
    // Case 2: nearby
    hostType = "nearby";
  } else {
    // TODO: Case 3: affiliated
    // Allow additional provided affiliated suffixes.
    // For example, allow ".gov" to be considered affiliated.

    // Case 4: external
    hostType = "external";
  }

  // Record under <hostType>_urls / <hostType>_domains. The hostname is logged
  // as requested: www and root are only treated the same when calculating
  // internal-ness, as they can still have different technical ramifications.
  addUnique(data[hostType + "_urls"], href);
  addUnique(data[hostType + "_domains"], url.hostname);

  // Check every URL (even internal/nearby/affiliated ones)
  // against the list of known services.
  let known = false;
  for (const name of Object.keys(known_services)) {
    const matched = known_services[name].some((service) => {
      // Either an exact match, or a subdomain of a known service-owned
      // hostname. The suffix must begin at a label boundary so that, e.g.,
      // "notexample.com" is not mistaken for "example.com".
      const suffix = service.startsWith(".") ? service : "." + service;
      return (
        www_host === service ||
        root_host === service ||
        root_host.endsWith(suffix)
      );
    });

    if (matched) {
      addUnique(data.known_services, name);
      known = true;
    }
  }

  // Specifically call out unknown external services for research.
  if (!known && hostType === "external") {
    addUnique(data.unknown_services, url.hostname);
  }
};
// For now, a naive base domain calculation: the last two dot-separated
// labels of the hostname (so multi-label public suffixes such as "co.uk"
// are not handled).
// TODO: use the Public Suffix List.
// TODO: may be useful to move to base.js or make a utils.js file.
/**
 * @param {string} input - Hostname to reduce.
 * @returns {string} The last two labels of `input`, or `input` unchanged
 *   when it has fewer than three labels or is an IPv4 address.
 */
var baseDomainFor = (input) => {
  // An IPv4 address has no base domain. Truncating it to its last two octets
  // would make unrelated addresses (10.0.1.1 vs. 192.168.1.1) look related.
  if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(input)) return input;

  return input.split(".").slice(-2).join(".");
};