-
Notifications
You must be signed in to change notification settings - Fork 0
/
hec.py
84 lines (62 loc) · 1.75 KB
/
hec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# coding=utf-8
# ©2021, Jean-Hugues Roy - Licence GNU GPL v3
from bs4 import BeautifulSoup
import csv, requests, io, time
from PyPDF2 import PdfFileReader
from selenium import webdriver
# Scraper for HEC Montréal thesis records.
#
# For every row of the input CSV, the record page is rendered in Chrome,
# a few labelled fields are read from the page, the pages of the open-access
# PDF (when one is linked) are counted, and one row is appended to the
# output CSV.

# Input CSV; the record identifier is read from the second column of each row.
finput = "hecURLs.csv"
# Output CSV; one row is appended per record.
fichier = "hec2021.csv"
# Identifying headers sent with the PDF download requests.
entetes1 = {"User-Agent":"Jean-Hugues Roy, UQAM, [email protected] - moissonnage en vue d'un article pour la revue de l'ACFAS"}


def url_notice(identifiant):
    """Return the URL of the record page for the given identifier."""
    return "http://reflexion.hec.ca/notice?id={}".format(identifiant)


def extraire_champ(divs, etiquette, defaut):
    """Return the text of the <span> following the <div> labelled *etiquette*.

    Args:
        divs: iterable of parsed <div> elements (BeautifulSoup tags).
        etiquette: exact text the label <div> must have.
        defaut: value returned when the label is absent or has no <span>
            after it.

    Returns:
        The stripped text of the span, or *defaut*. When several <div>s carry
        the same label, the last one wins.
    """
    valeur = defaut
    for div in divs:
        if div.text == etiquette:
            suivant = div.find_next("span")
            if suivant is None:
                # A label without a value: report the default rather than
                # a value taken from an earlier, unrelated match.
                return defaut
            valeur = suivant.text.strip()
    return valeur


def trouver_url_pdf(divs):
    """Return the href of the open-access PDF link, or None when there is none.

    The link is the first <a> following the <div> whose text is exactly
    "Libre accès à la publication"; the last such <div> with a usable href
    wins.
    """
    url = None
    for div in divs:
        if div.text == "Libre accès à la publication":
            lien = div.find_next("a")
            if lien is not None and lien.get("href"):
                url = lien["href"]
    return url


def compter_pages(url_pdf, delai_max=60):
    """Download the PDF at *url_pdf* and return its number of pages.

    Args:
        url_pdf: URL of the PDF, or None/empty when the record has no PDF.
        delai_max: timeout, in seconds, passed to the HTTP request.

    Returns:
        The page count, or None when there is no URL or when the download or
        the PDF parsing fails (counting pages is best-effort).
    """
    if not url_pdf:
        # No link on this record: never reuse a URL from a previous record.
        return None
    try:
        reponse = requests.get(url_pdf, headers=entetes1, timeout=delai_max)
        with io.BytesIO(reponse.content) as pdf_en_memoire:
            # NOTE(review): PdfFileReader/getNumPages are the legacy PyPDF2
            # names; recent PyPDF2 releases appear to reject them, in which
            # case every count would come back as None - confirm against the
            # installed version.
            return PdfFileReader(pdf_en_memoire).getNumPages()
    except Exception as erreur:
        # Deliberately broad: network errors and malformed PDFs raise many
        # different exception types. Unlike a bare "except", this still lets
        # Ctrl-C (KeyboardInterrupt) stop the run.
        print("Nombre de pages indisponible pour {} : {}".format(url_pdf, erreur))
        return None


def moissonner_notice(navigateur, identifiant, delai=4):
    """Scrape one record and return its output row.

    Args:
        navigateur: a Selenium WebDriver used to render the page.
        identifiant: record identifier inserted into the record URL.
        delai: seconds to wait after loading the page before reading its
            source (fixed pause kept from the original script).

    Returns:
        A list: ["HEC", title, year, author, programme, language, degree,
        page count, record URL]. Author and language are not extracted and
        are always "?" and "inconnue".
    """
    urlThese = url_notice(identifiant)
    print(urlThese)
    navigateur.get(urlThese)
    time.sleep(delai)
    page = BeautifulSoup(navigateur.page_source, "html.parser")

    balise_titre = page.find("title")
    titre = balise_titre.text if balise_titre is not None else "inconnu"

    divs = page.find_all("div")
    annee = extraire_champ(divs, "Date de diplomation", 0)
    departement = extraire_champ(divs, "Programme", "inconnu")
    diplome = extraire_champ(divs, "Cheminement", "inconnu")
    num_pages = compter_pages(trouver_url_pdf(divs))

    auteur = "?"
    langue = "inconnue"
    return ["HEC", titre, annee, auteur, departement, langue, diplome, num_pages, urlThese]


def main():
    """Scrape every record listed in the input CSV and append it to the output CSV."""
    with open(finput, newline="", encoding="utf-8") as source:
        lignes = sorted(csv.reader(source))

    # One browser for the whole run; quit() in "finally" so that neither the
    # window nor the driver process is left behind when something fails.
    navigateur = webdriver.Chrome()
    try:
        for ligne in lignes:
            if len(ligne) < 2:
                # Blank or short row: there is no identifier column to read.
                print("Ligne ignorée (pas d'identifiant) : {}".format(ligne))
                continue
            infos = moissonner_notice(navigateur, ligne[1])
            print(infos)
            # Open, append and close for each record so that the rows already
            # scraped are on disk if a later record fails.
            with open(fichier, "a", newline="", encoding="utf-8") as sortie:
                csv.writer(sortie).writerow(infos)
            print("$"*10)
    finally:
        navigateur.quit()


if __name__ == "__main__":
    main()