'''
CLASS: Web Scraping with Beautiful Soup
What is web scraping?
- Extracting information from websites (simulates a human copying and pasting)
- Based on finding patterns in website code (usually HTML)
What are best practices for web scraping?
- Scraping too many pages too fast can get your IP address blocked
- Pay attention to the robots exclusion standard (robots.txt)
- Let's look at http://www.imdb.com/robots.txt
What is HTML?
- Code interpreted by a web browser to produce ("render") a web page
- Let's look at example.html
- Tags are opened and closed
- Tags have optional attributes
How to view HTML code:
- To view the entire page: "View Source" or "View Page Source" or "Show Page Source"
- To view a specific part: "Inspect Element"
- Safari users: Safari menu, Preferences, Advanced, Show Develop menu in menu bar
- Let's inspect example.html
'''
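# For reference, a minimal sketch of the kind of markup example.html contains
# (a hypothetical reconstruction based on the tags and attributes used below,
# not the actual course file):
'''
<html>
    <body>
        <h1 id="main">Main heading</h1>
        <h2>Sub heading</h2>
        <p id="scraping" class="topic">Paragraph about web scraping</p>
        <p id="reproducibility" class="topic">Paragraph about reproducibility</p>
        <ul id="scraping">
            <li>Scraping resource one</li>
            <li>Scraping resource two</li>
        </ul>
        <ul id="api">
            <li>API resource one</li>
            <li>API resource two</li>
        </ul>
    </body>
</html>
'''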
# read the HTML code for a web page and save as a string
with open('example.html', 'rU') as f:
    html = f.read()
# convert HTML into a structured Soup object
from bs4 import BeautifulSoup
b = BeautifulSoup(html)
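# note: newer versions of Beautiful Soup warn if you don't specify a parser;
# passing one explicitly, e.g. BeautifulSoup(html, 'html.parser'), avoids the warning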
# print out the object
print b
print b.prettify()
# 'find' method returns the first matching Tag (and everything inside of it)
b.find(name='body')
b.find(name='h1')
# Tags allow you to access the 'inside text'
b.find(name='h1').text
# Tags also allow you to access their attributes
b.find(name='h1')['id']
# 'find_all' method is useful for finding all matching Tags
b.find(name='p') # returns a Tag
b.find_all(name='p') # returns a ResultSet (like a list of Tags)
# ResultSets can be indexed and sliced like lists
len(b.find_all(name='p'))
b.find_all(name='p')[0]
b.find_all(name='p')[0].text
b.find_all(name='p')[0]['id']
# iterate over a ResultSet
results = b.find_all(name='p')
for tag in results:
    print tag.text
# limit search by Tag attribute
b.find(name='p', attrs={'id':'scraping'})
b.find_all(name='p', attrs={'class':'topic'})
# limit search to specific sections
b.find_all(name='li')
b.find(name='ul', attrs={'id':'scraping'}).find_all(name='li')
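# 'find' returns None when nothing matches, so chained calls can raise an
# AttributeError; a minimal defensive pattern (the 'nonexistent' id is just
# an illustration):
tag = b.find(name='ul', attrs={'id':'nonexistent'})
if tag is not None:
    for li in tag.find_all(name='li'):
        print li.text
else:
    print 'no matching tag found'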
'''
EXERCISE ONE
'''
# find the 'h2' tag and then print its text
b.find(name='h2').text
# find the 'p' tag with an 'id' value of 'reproducibility' and then print its text
b.find(name='p', attrs={'id':'reproducibility'}).text
# find the first 'p' tag and then print the value of the 'id' attribute
b.find(name='p')['id']
# print the text of all four resources
results = b.find_all(name='li')
for tag in results:
    print tag.text
# print the text of only the API resources
results = b.find(name='ul', attrs={'id':'api'}).find_all(name='li')
for tag in results:
    print tag.text
'''
Scraping the IMDb website
'''
# get the HTML from the Shawshank Redemption page
import requests
r = requests.get('http://www.imdb.com/title/tt0111161/')
# convert HTML into Soup
b = BeautifulSoup(r.text)
print b
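# it's worth confirming the request succeeded before parsing; a minimal check:
if r.status_code != 200:
    print 'request failed with status code', r.status_code
# if a site rejects the default User-Agent, sending a browser-like header can
# help (the header value below is just an example):
# r = requests.get('http://www.imdb.com/title/tt0111161/', headers={'User-Agent': 'Mozilla/5.0'})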
# run this code if you have encoding errors when printing (Python 2 only;
# changing the default encoding like this is a quick hack, not a recommended practice)
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# get the title
b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'name'})
b.find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text
b.find(name='h1').find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text
# get the star rating
float(b.find(name='span', attrs={'itemprop':'ratingValue'}).text)
float(b.find(name='div', attrs={'class':'titlePageSprite star-box-giga-star'}).text)
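# note: IMDb redesigns its pages from time to time, so the tag names and
# class names used here may need updating if the markup has changed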
'''
EXERCISE TWO
'''
# get the description
b.find(name='p', attrs={'itemprop':'description'}).text.strip()
# get the content rating
b.find(name='meta', attrs={'itemprop':'contentRating'})['content']
# get the duration in minutes (as an integer)
int(b.find(name='time', attrs={'itemprop':'duration'})['datetime'][2:-1])
int(b.find(name='time', attrs={'itemprop':'duration'}).text.strip()[:-4])
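# the 'datetime' attribute holds an ISO 8601 duration such as 'PT142M', so
# [2:-1] strips the leading 'PT' and the trailing 'M'; a slightly more robust
# sketch pulls out the digits explicitly (assuming the same minutes-only format):
import re
duration_str = b.find(name='time', attrs={'itemprop':'duration'})['datetime']
match = re.search(r'(\d+)', duration_str)
if match:
    duration = int(match.group(1))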
'''
OPTIONAL HOMEWORK
First, define a function that accepts an IMDb ID and returns a dictionary of
movie information: title, star_rating, description, content_rating, duration.
(This is really just a wrapper of the web scraping code we wrote above.)
For example, get_movie_info('tt0111161') returns:
{'content_rating': 'R',
'description': u'Two imprisoned men bond over a number of years...',
'duration': 142,
'star_rating': 9.3,
'title': u'The Shawshank Redemption'}
Then, open the file imdb_ids.txt using Python, and write a for loop that builds
a list in which each element is a dictionary of movie information.
Finally, convert that list into a DataFrame.
'''
# define a function that accepts an IMDb ID and returns a dictionary of movie information
def get_movie_info(imdb_id):
    r = requests.get('http://www.imdb.com/title/' + imdb_id + '/')
    b = BeautifulSoup(r.text)
    info = {}
    info['title'] = b.find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text
    info['star_rating'] = float(b.find(name='span', attrs={'itemprop':'ratingValue'}).text)
    info['description'] = b.find(name='p', attrs={'itemprop':'description'}).text.strip()
    info['content_rating'] = b.find(name='meta', attrs={'itemprop':'contentRating'})['content']
    info['duration'] = int(b.find(name='time', attrs={'itemprop':'duration'})['datetime'][2:-1])
    return info
# test the function
get_movie_info('tt0111161')
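# a hypothetical defensive wrapper: if any field is missing, a 'find' call
# returns None and the lookup raises an error, so catch it and return None
def get_movie_info_safe(imdb_id):
    try:
        return get_movie_info(imdb_id)
    except (AttributeError, TypeError):
        return None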
# open the file of IDs (one ID per row), and store the IDs in a list
with open('imdb_ids.txt', 'rU') as f:
    imdb_ids = [row.strip() for row in f]
# get the information for each movie, and store the results in a list
from time import sleep
movies = []
for imdb_id in imdb_ids:
    movies.append(get_movie_info(imdb_id))
    sleep(1)    # pause between requests so we don't hammer the server
# check that the list of IDs and list of movies are the same length
assert len(imdb_ids) == len(movies)
# convert the list of movies into a DataFrame
import pandas as pd
pd.DataFrame(movies, index=imdb_ids)
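# in practice you would typically keep a reference to the DataFrame and save
# it for later use; a minimal sketch (the filename is just an example):
movies_df = pd.DataFrame(movies, index=imdb_ids)
movies_df.to_csv('movies.csv')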
'''
Another IMDb example: Getting the genres
'''
# read the Shawshank Redemption page again
r = requests.get('http://www.imdb.com/title/tt0111161/')
b = BeautifulSoup(r.text)
# only gets the first genre
b.find(name='span', attrs={'class':'itemprop', 'itemprop':'genre'})
# gets all of the genres
b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'genre'})
# stores the genres in a list
[tag.text for tag in b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'genre'})]
'''
Another IMDb example: Getting the writers
'''
# attempt to get the list of writers (too many results)
b.find_all(name='span', attrs={'itemprop':'name'})
# limit search to a smaller section to only get the writers
b.find(name='div', attrs={'itemprop':'creator'}).find_all(name='span', attrs={'itemprop':'name'})
'''
Another IMDb example: Getting the URLs of cast images
'''
# find the images by size
results = b.find_all(name='img', attrs={'height':'44', 'width':'32'})
# check that the number of results matches the number of cast images on the page
len(results)
# iterate over the results to get all URLs
for tag in results:
    print tag['loadlate']    # IMDb lazy-loads images, so the URL is in the 'loadlate' attribute
'''
Useful to know: Alternative Beautiful Soup syntax
'''
# read the example web page again
with open('example.html', 'rU') as f:
    html = f.read()
# convert to Soup
b = BeautifulSoup(html)
# these are equivalent
b.find(name='p') # normal way
b.find('p') # 'name' is the first argument
b.p # can also be accessed as an attribute of the object
# these are equivalent
b.find(name='p', attrs={'id':'scraping'}) # normal way
b.find('p', {'id':'scraping'}) # 'name' and 'attrs' are the first two arguments
b.find('p', id='scraping') # can write the attributes as arguments
# these are equivalent
b.find(name='p', attrs={'class':'topic'}) # normal way
b.find('p', class_='topic') # 'class' is special, so it needs a trailing underscore
b.find('p', 'topic') # if you don't name it, it's assumed to be the class
# these are equivalent
b.find_all(name='p') # normal way
b.findAll(name='p') # old function name from Beautiful Soup 3
b('p') # if you don't name the method, it's assumed to be find_all
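# Beautiful Soup also supports CSS selectors via the 'select' method, which
# always returns a list; a few examples against example.html:
b.select('p')              # all 'p' tags
b.select('p#scraping')     # 'p' tag with id 'scraping'
b.select('p.topic')        # 'p' tags with class 'topic'
b.select('ul#api li')      # 'li' tags inside the 'ul' with id 'api'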