-
Notifications
You must be signed in to change notification settings - Fork 0
/
ohrs-scraper.py
123 lines (101 loc) · 4.3 KB
/
ohrs-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
import requests
import re
import geopy
from geopy.distance import geodesic
from geopy.geocoders import Nominatim, ArcGIS
from prettytable import PrettyTable
# This script scrapes the OHRS system for huts that have free beds from a selected date on.
# It is a bit like https://magazin.alpenverein.de/artikel/last-minute-huettenbuchung_5e154190-2c02-47a0-80fe-2cd4da0ba550#/
# but is not limited to the upcoming two days.
# --- Search parameters (edit these before running) ---------------------------

# Date sent to the reservation system as the start of the stay
# (presumably DD.MM.YYYY -- confirm against the OHRS site).
START_DATE = "01.09.2023"
# Number of consecutive nights that must be available; max 14 nights.
NIGHTS = 2
# Minimum `freeRoom` value a room category must report for a night to count.
PERSONS = 2
# Home address; it is geocoded and used as the origin for the distance column.
HOME = "München, Germany"
# Base URL of the reservation system; endpoint paths are appended to it.
URL = "https://www.alpsonline.org/reservation"
# No idea what the real limit is: experimentally determined max value of hut_id.
MAX_ID = 700
# Collected matches: one dict per hut with the keys
# "hut_id", "name", "height", "distance" and "coordinates".
results = []
# WARNING: Parsing the hut geolocations and calculating the distance does
# not work well. Please don't rely on it.
# The coordinate formats are also pretty wild (UTM, CH1903) and geopy can't
# parse them properly.
# I have also most likely used geopy incorrectly.
# Resolve the HOME address to a (latitude, longitude) tuple that is later used
# as the origin of the distance calculation.
geolocator = Nominatim(user_agent="OHRS-scraper")
location = geolocator.geocode(HOME)
if location is None:
    # geocode() yields None when the address cannot be resolved; stop with a
    # clear message instead of failing on the attribute access below.
    raise SystemExit(f"Could not geocode home address: {HOME!r}")
print("Using Home Address: " + location.address)
home = (location.latitude, location.longitude)
# A single session is reused for all requests: the availability endpoint
# relies on the session cookie set by the preceding calendar request to know
# which hut is selected.
session = requests.Session()

# Hut IDs with badly formatted data; skipped for now, maybe check manually later.
_SKIPPED_HUT_IDS = frozenset({446, 526, 607})

# Various "Hut warden(s)" placeholders used for empty or non-available huts.
_MISSING_HUT_MARKERS = (
    "Hut warden(s): </span>",
    "Hut warden(s): -</span>",
    "Hut warden(s): -----</span>",
    "Hut warden(s): XX</span>",
)

# Distance assigned to huts whose coordinates cannot be interpreted, so that
# they end up last when the results are sorted by distance.
_UNKNOWN_DISTANCE = 999999

# Patterns applied to the hut's info block (compiled once, outside the loop).
_NAME_RE = re.compile(r'(?<=>).+(?=<)')
_HEIGHT_RE = re.compile(r"(?<=Height above sea level: )[0-9\.']+")
_COORDINATES_RE = re.compile(r'(?<=Coordinates: ).+(?=</)')


def _night_has_room(offers):
    """Return True if one of the night's offers is a fitting room.

    An offer fits when its "bedCategoryType" is "ROOM" and its "freeRoom"
    value is at least PERSONS.

    TODO: figure out bedCategoryId. Values seen so far (labels as shown by
    the reservation system, English gloss in parentheses):
         1 Massenlager (dormitory)
         2 Zimmer (room) (?)
         4 Matratzenlager (mattress dormitory)
         5 Mehrbettzimmer (multi-bed room)
         6 Zweierzimmer (two-bed room)
         7 Matratzenlager (mattress dormitory)
         8 Mehrbettzimmer (multi-bed room)
         9 Zweierzimmer (two-bed room)
        13 Doppelzimmer (double room)
        14 4er Zimmer (four-bed room)
        15 6er Zimmer (six-bed room)
        18 Spezialzimmer (special room)
        20 Einzelzimmer (single room)
    """
    return any(
        offer["bedCategoryType"] == "ROOM" and offer["freeRoom"] >= PERSONS
        for offer in offers
    )


def _distance_from_home(coordinates):
    """Return the distance in km between `home` and the hut coordinates.

    This is probably rubbish! Some coordinates are messed up or use formats
    such as CH1903 that geopy cannot parse; those (and missing coordinates)
    yield _UNKNOWN_DISTANCE.
    """
    if not coordinates:
        return _UNKNOWN_DISTANCE
    try:
        return geodesic(home, geopy.Point(coordinates)).km
    except (TypeError, ValueError):
        return _UNKNOWN_DISTANCE


# MAX_ID is the highest known hut_id, so it has to be included in the scan.
for hut_id in range(1, MAX_ID + 1):
    if hut_id in _SKIPPED_HUT_IDS:
        continue

    # Fetch the calendar page (this also selects the hut for the session) and
    # cut out the info block, i.e. everything from its opening tag up to the
    # first closing </div>.
    page = session.get(URL + f"/calendar?hut_id={hut_id}&lang=en").text
    start = page.find('<div class="info">')
    end = page.find('</div>', start) if start != -1 else -1
    if end == -1:
        continue  # no usable info block on this page
    info = page[start:end]

    name_match = _NAME_RE.search(info)
    if name_match is None:
        continue  # info block without a recognisable hut name
    name = name_match[0]

    if any(marker in info for marker in _MISSING_HUT_MARKERS):
        continue  # hut not found

    print(f"\r{hut_id:03}/{MAX_ID}, {name}{30*' '}", end="")

    # Availability for the 14 days after START_DATE as JSON, keyed by the day
    # offset ("0", "1", ...). The hut is the one selected by the previous
    # request, which is why the same session has to be used.
    response = session.get(URL + f"/selectDate?date={START_DATE}")
    try:
        availability = response.json()  # parsed once, not once per night
    except ValueError:
        continue  # response body is not JSON; treat the hut as unavailable

    # Every requested night needs at least one fitting room; a night that is
    # missing from the response counts as not available.
    if not all(
        _night_has_room(availability.get(str(offset), ()))
        for offset in range(NIGHTS)
    ):
        continue

    # We have found a matching hut! Parse some more data.
    height_match = _HEIGHT_RE.search(info)
    # "." and "'" only act as thousands separators and are dropped.
    digits = height_match[0].replace(".", "").replace("'", "") if height_match else ""
    height = int(digits) if digits else None

    coordinates_match = _COORDINATES_RE.search(info)
    coordinates = coordinates_match[0] if coordinates_match else ""
    # Normalise typographic minute marks and decimal commas for geopy.
    coordinates = coordinates.replace("’", "'").replace("′", "'").replace(",", ".")

    results.append({"hut_id": hut_id,
                    "name": name,
                    "height": height,
                    "distance": round(_distance_from_home(coordinates), 1),
                    "coordinates": coordinates})

# Terminate the in-place progress line so later output starts on a fresh line.
print()
# Try to sort by distance; huts with unparseable coordinates carry a very
# large placeholder distance and therefore end up at the bottom.
results.sort(key=lambda hut: hut["distance"])

table = PrettyTable()
table.field_names = ["ID", "Hut", "Height [m]", "Distance [km]", "Coordinates"]
for hut in results:
    table.add_row([hut["hut_id"], hut["name"], hut["height"],
                   hut["distance"], hut["coordinates"]])

# Write the CSV as UTF-8 (hut names contain non-ASCII characters) and with
# newline="" so the line endings already present in the CSV text are written
# unchanged instead of being translated a second time on Windows.
with open("result.csv", "w", encoding="utf-8", newline="") as fp:
    fp.write(table.get_formatted_string("csv"))

print(table)