-
Notifications
You must be signed in to change notification settings - Fork 0
/
sf_warc_iter.py
101 lines (80 loc) · 3.25 KB
/
sf_warc_iter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import socialfeedtools.warc as warc
import json
import argparse
import socialfeedtools.utils as utils
import logging
from urllib3.exceptions import ProtocolError
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def tumblr_response_iter(record):
    """Yield the entities contained in a Tumblr API response record.

    The JSON body is read from ``record.http_response`` (a file-like object).
    Yields ``("tumblr_blog", blog)`` once when the response carries a blog,
    followed by ``("tumblr_post", post)`` for every post.

    A body whose ``"response"`` member is missing or is not a JSON object
    (for example an API error reply) yields nothing instead of raising, so
    one bad record does not abort a whole dump.

    Raises:
        ValueError: if the body is not valid JSON.
    """
    json_obj = json.load(record.http_response)
    response = json_obj.get("response") if isinstance(json_obj, dict) else None
    if not isinstance(response, dict):
        # Nothing to extract from this record.
        return
    if "blog" in response:
        yield "tumblr_blog", response["blog"]
    # "posts" may be absent or null; treat both as "no posts".
    for post in response.get("posts") or ():
        yield "tumblr_post", post
def flickr_response_iter(record):
    """Yield the single entity contained in a Flickr API response record.

    The JSON body is read from ``record.http_response``. A body with a
    ``"photo"`` member yields ``("flickr_photo", photo)``; otherwise a body
    with a ``"person"`` member yields ``("flickr_person", person)``. A body
    with neither member yields nothing.
    """
    payload = json.load(record.http_response)
    # Checked in priority order; only the first key present is reported.
    known_entities = (("photo", "flickr_photo"), ("person", "flickr_person"))
    for key, entity_type in known_entities:
        if key in payload:
            yield entity_type, payload[key]
            break
def twitter_rest_response_iter(record):
    """Yield ``("tweet", tweet)`` for each status in a Twitter REST response.

    The JSON body is read from ``record.http_response``. Tweets are taken
    from the top-level ``"statuses"`` member. A body that is not a JSON
    object, or that has no ``"statuses"`` member, yields nothing instead of
    raising, so one bad record does not abort a whole dump.

    Raises:
        ValueError: if the body is not valid JSON.
    """
    json_obj = json.load(record.http_response)
    if not isinstance(json_obj, dict):
        return
    # "statuses" may be absent or null; treat both as "no tweets".
    for tweet in json_obj.get("statuses") or ():
        yield "tweet", tweet
def twitter_stream_response_iter(record):
    """Yield ``("tweet", tweet)`` for each JSON line of a Twitter stream record.

    Lines are obtained from ``utils.iter_lines(record.http_response)`` and
    each one is parsed as a separate JSON document. Lines that are not valid
    JSON are skipped. A ``ProtocolError`` raised while reading ends the
    iteration quietly, keeping whatever was yielded before it.
    """
    try:
        for line in utils.iter_lines(record.http_response):
            # Guard only the parse, so the handler cannot swallow a
            # ValueError that did not come from json.loads.
            try:
                tweet = json.loads(line)
            except ValueError:
                # Bad tweet: skip this line and keep going.
                continue
            yield "tweet", tweet
    except ProtocolError:
        # Last chunk incomplete: stop here.
        return
# URL predicate -> service name. get_service() tries these predicates against
# a record's target URL and reports the service of the first one that accepts.
to_service_dict = {
    utils.is_tumblr_url: "tumblr",
    utils.is_flickr_url: "flickr",
    utils.is_twitter_rest_url: "twitter_rest",
    utils.is_twitter_stream_url: "twitter_stream",
}
def get_service(record):
    """Return the service name for a WARC record, or None if unrecognized.

    The record's ``WARC-Target-URI`` header is tested against each URL
    predicate in ``to_service_dict``; the service mapped to the first
    predicate that accepts the URL is returned.

    Raises:
        KeyError: if the record has no ``WARC-Target-URI`` header.
    """
    url = record.header["WARC-Target-URI"]
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for is_func, service in to_service_dict.items():
        if is_func(url):
            return service
    return None
# Service name -> generator function that takes a record and yields
# (entity_type, entity_obj) pairs for that service's responses.
service_to_iter_func_dict = {
    "tumblr": tumblr_response_iter,
    "flickr": flickr_response_iter,
    "twitter_rest": twitter_rest_response_iter,
    "twitter_stream": twitter_stream_response_iter,
}
def iter_warc(filepath, services=(), entities=(), pretty=False):
    """Print, as JSON, the entities found in the response records of a WARC.

    Args:
        filepath: Path of the WARC file, opened with warc.WARCResponseFile.
        services: Service names to include; empty means every service.
        entities: Entity types to include; empty means every entity type.
        pretty: When true, JSON is indented by 4 spaces; otherwise each
            entity is printed on a single line.

    Records whose target URL matches no known service are skipped. The WARC
    file is closed even if iteration fails.
    """
    log.info("File %s", filepath)
    indent = 4 if pretty else None
    f = warc.WARCResponseFile(filepath)
    try:
        for record in f:
            # Determine the service; skip unknown or filtered-out services.
            service = get_service(record)
            if not service or (services and service not in services):
                continue
            # Iterate over the entities produced by the service's iterator.
            for entity_type, entity_obj in service_to_iter_func_dict[service](record):
                if not entities or entity_type in entities:
                    # Single-argument call form: same output on Python 2 and 3.
                    print(json.dumps(entity_obj, indent=indent))
    finally:
        f.close()
if __name__ == '__main__':
    # Command-line entry point: dump the entities of one or more WARC files.
    parser = argparse.ArgumentParser()
    # The names listed here must match the service names that iter_warc
    # compares against, otherwise the filter silently matches nothing.
    parser.add_argument("--services",
                        help="A comma separated list of services to limit the results to. "
                             "Services are: tumblr, flickr, twitter_rest, twitter_stream.")
    parser.add_argument("--entities",
                        help="A comma separated list of entities to limit the results to. "
                             "Entities are: tweet, tumblr_blog, tumblr_post, flickr_photo, flickr_person.")
    parser.add_argument("--pretty", action="store_true", help="Format the json for viewing.")
    parser.add_argument("filepath", nargs="+", help="Filepath of the warc.")

    args = parser.parse_args()

    def _split_csv(value):
        """Split a comma separated option into a tuple of non-empty, trimmed items."""
        if not value:
            return ()
        return tuple(item.strip() for item in value.split(",") if item.strip())

    svcs = _split_csv(args.services)
    ents = _split_csv(args.entities)

    for fp in args.filepath:
        iter_warc(fp, services=svcs, pretty=args.pretty, entities=ents)