-
Notifications
You must be signed in to change notification settings - Fork 7
/
utils.py
91 lines (79 loc) · 2.88 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from urllib.parse import urlparse
from urllib3.exceptions import MaxRetryError
import tldextract
import publicsuffix2
import csv
import subprocess
import json
# Configuration file names.
# NOTE(review): which file is used in which mode is decided by code outside
# this module — confirm against the callers.
CONFIG_FILE = 'cookinspect.conf'
CONFIG_FILE_PROD = 'cookinspect_prod.conf'
CONFIG_FILE_AC = 'cookinspect_ac.conf'
# consent string set by senscritique.com p, 2019.07.25
CONSENT_STRING_SENSCRITIQUE = 'BOkQjswOkQjswBcAcBFRCc-AAAApMhv4XjiARsho1NRBJgABALiAiAAAQAAYABIFAAASgABBCAkAgAAAA4gAAEAAAABIBIAAAAAAAgAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
# Waiting times and limits.
# NOTE(review): presumably expressed in seconds (the measurements quoted
# below are in seconds) — confirm where these values are consumed.
SLEEP_TIME_BUTTON_CLICK = 2
SLEEP_TIME_GET_LOGS = 5
# Among 222 websites observed loading __cmp after a 5s wait, the slowest
# one (lemondeinformatique.fr) needed 2s to load it. Five websites needed
# 1s, and all the others needed no delay at all (once Selenium considers
# the page loaded). So we might miss websites with a 2s delay, but we
# probably won't, or only very few of them.
SLEEP_TIME_CMP_WAIT = 3
SLEEP_TIME_REFRESH = 3
SLEEP_TIME_COOKIE_WAIT = 2
TIMEOUT = 10
MAX_TRIES_TIMEOUT = 3
# states (do not edit)
# NOTE(review): the numeric values are presumably relied upon elsewhere
# (hence "do not edit") — confirm before renumbering.
BEFORE_ACTION = 1
AFTER_REFUSAL = 2
AFTER_ACCEPTANCE = 3
def get_domain(url, subdomain=False):
    """Return the domain part of ``url``.

    Args:
        url: the URL to analyse.
        subdomain: when true, return the full network location reported
            by ``urlparse`` unchanged. When false (the default), split the
            URL with ``tldextract`` and return ``"<domain>.<suffix>"``.

    Returns:
        The selected domain as a string.
    """
    if not subdomain:
        parts = tldextract.extract(url)
        return "{}.{}".format(parts.domain, parts.suffix)
    return urlparse(url).netloc
def url_to_domain(url, psl):
    """Return what ``publicsuffix2.get_sld`` computes for the host of ``url``.

    Args:
        url: the URL whose network location is extracted with ``urlparse``.
        psl: passed through unchanged as the second argument of
            ``publicsuffix2.get_sld`` (presumably a public suffix list
            object — confirm against the caller).

    Returns:
        The value returned by ``publicsuffix2.get_sld``.
    """
    host = '{uri.netloc}'.format(uri=urlparse(url))
    return publicsuffix2.get_sld(host, psl)
def quit_properly(browser):
    """Call ``browser.close()`` and then ``browser.quit()``.

    A ``MaxRetryError`` raised by ``browser.close()`` is reported on stdout
    and otherwise ignored; any other exception propagates to the caller.

    NOTE(review): ``browser.quit()`` is placed after the try/except so that
    it runs on both the success and the error path — confirm this matches
    the intended structure. ``browser`` is presumably a Selenium WebDriver;
    verify against the caller.
    """
    try:
        browser.close()
    except MaxRetryError as e:
        # Example: https://www.swingerdreamland.hu
        print("Error while trying to close browser (maybe it's already closed?): %s" % e)
    browser.quit()
def import_iab_cmp_list(short_names=False):
    """Load the IAB CMP list from a CSV file as a ``{cmp_id: name}`` dict.

    Args:
        short_names: when true, read ``cmplist/IAB_CMP_list_full.csv``;
            otherwise read ``cmplist/IAB_CMP_list_full_fullnames.csv``.
            Both paths are relative to the current working directory.

    Returns:
        A dict mapping the first column of each data row (converted to
        ``int``) to the second column. The first row of the file is
        treated as a header and skipped; blank lines are ignored.

    Raises:
        OSError: if the CSV file cannot be opened.
        ValueError: if the first column of a data row is not an integer.
        IndexError: if a non-empty data row has fewer than two columns.
    """
    if short_names:
        path = "cmplist/IAB_CMP_list_full.csv"
    else:
        path = 'cmplist/IAB_CMP_list_full_fullnames.csv'

    cmp_names = {}
    # The context manager closes the file even if a row fails to parse;
    # newline='' is what the csv module requires for file objects.
    with open(path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            cmp_names[int(row[0])] = row[1]
    return cmp_names
def decode_consent_string(consent_string, v2=False):
    """Decode a consent string by running a Node.js helper script.

    Args:
        consent_string: the string handed to the script as its only
            command-line argument.
        v2: when true, run ``decode_IAB_API_strings_v2.js``; otherwise run
            ``decode_IAB_API_strings.js``. Script paths are relative to the
            current working directory.

    Returns:
        The script's standard output parsed as JSON, or ``None`` (after
        printing a message) when the script exits with a non-zero status.
    """
    script = (
        "decode_IAB_API_strings/decode_IAB_API_strings_v2.js"
        if v2
        else "decode_IAB_API_strings/decode_IAB_API_strings.js"
    )
    # Only stdout is captured; the child's stderr goes to our own stderr.
    completed = subprocess.run(
        ['node', script, consent_string], stdout=subprocess.PIPE
    )
    if completed.returncode != 0:
        print("Unable to decode consent string.")
        return None
    return json.loads(completed.stdout)
def get_vendor_list(vendorlist_id=163):
    """Load a vendor list from ``vendorlist/vendorlist_<id>.json``.

    Args:
        vendorlist_id: identifier of the vendor list; converted with
            ``int()`` before being inserted into the file name. The path
            is relative to the current working directory.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if ``vendorlist_id`` is not convertible to ``int`` or
            the file does not contain valid JSON.
    """
    path = 'vendorlist/vendorlist_%d.json' % int(vendorlist_id)
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which would mis-decode non-ASCII vendor names.
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)