Skip to content

Commit

Permalink
Re-run geocoder in 2024 (#121)
Browse files: browse the repository at this point in the history
* port coder / test to Python 3 and pytest

* fix some more tests

* commit intersections.json

* port coders to python 3

* Mostly able to regenerate geocodes!

* restore bug for perfect repro

* notes

* commit records.pickle

* unzip geocache

* drop records.pickle
  • Loading branch information
danvk authored Aug 27, 2024
1 parent 6bae712 commit 463f1c3
Show file tree
Hide file tree
Showing 19 changed files with 14,620 additions and 134 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ feedback.json
.vscode
.DS_Store
.nypl-token.txt
geocache
1 change: 1 addition & 0 deletions coders/extended-grid-cases.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
18th Street and Avenue A, Manhattan, NY
17 changes: 8 additions & 9 deletions coders/extended_grid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
'''Geocode intersections by extending the existing NYC grid.
This lets us cover intersections which no longer exist, but may have in the
Expand All @@ -11,7 +11,6 @@
grid/extrapolate.py.
'''

import json
import re
import fileinput
import sys
Expand Down Expand Up @@ -96,7 +95,7 @@ def extract_ordinal(txt):

def multisearch(re_dict, txt):
'''Search for any of the keys. Given a match, return the value.'''
for k, v in re_dict.iteritems():
for k, v in re_dict.items():
if re.search(k, txt, flags=re.I):
return v
return None
Expand All @@ -107,17 +106,17 @@ def __init__(self):
# This is done here to avoid the milstein registering itself.
from coders.milstein import cross_patterns
self._cross_patterns = cross_patterns

def _extractLocationStringFromRecord(self, r):
raw_loc = r.location().strip()
loc = re.sub(r'^[ ?\t"\[]+|[ ?\t"\]]+$', '', raw_loc)
return loc

def codeRecord(self, r):
if r.source() != 'Milstein Division': return None

loc = self._extractLocationStringFromRecord(r)

m = None
for pattern in self._cross_patterns:
m = re.match(pattern, loc)
Expand All @@ -131,7 +130,7 @@ def codeRecord(self, r):
try:
avenue, street = parse_street_ave(street1, street2)
except ValueError as e:
sys.stderr.write('%s: %s\n' % (loc, e.message))
sys.stderr.write('%s: %s\n' % (loc, str(e)))
return None

# Special cases
Expand Down Expand Up @@ -192,7 +191,7 @@ def name(self):
}
result = grid_coder.codeRecord(r)

print '"%s" -> %s' % (addr, result)
print('"%s" -> %s' % (addr, result))
if result:
num_ok += 1
else:
Expand Down
12 changes: 6 additions & 6 deletions coders/milstein.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def codeRecord(self, r):
'source': loc,
'type': 'intersection'
}

for pattern in addr_patterns:
m = re.match(pattern, loc)
if m: break
Expand All @@ -103,7 +103,7 @@ def codeRecord(self, r):
'source': loc,
'type': 'street_address'
}

for pattern in place_patterns:
m = re.match(pattern, loc)
if m: break
Expand All @@ -116,7 +116,7 @@ def codeRecord(self, r):
'type': 'street_address' # or 'point_of_interest' or 'establishment'
}

sys.stderr.write('(%s) Bad location: %s\n' % (r.photo_id(), loc));
sys.stderr.write('(%s) Bad location: %s\n' % (r.photo_id(), loc))
return None


Expand Down Expand Up @@ -147,7 +147,7 @@ def _getBoroughFromAddress(self, address):

def getLatLonFromGeocode(self, geocode, data, r):
'''Extract (lat, lon) from a Google Maps API response. None = failure.
This ensures that the geocode is in the correct borough. This helps catch
errors involving identically-named cross streets in multiple boroughs.
'''
Expand All @@ -163,7 +163,7 @@ def getLatLonFromGeocode(self, geocode, data, r):
sys.stderr.write('Borough mismatch: "%s" (%s) geocoded to %s\n' % (
self._extractLocationStringFromRecord(r), record_boro, geocode_boro))
return None

return (lat, lon)

def finalize(self):
Expand Down Expand Up @@ -191,7 +191,7 @@ def name(self):
}
result = coder.codeRecord(r)

print '"%s" -> %s' % (addr, result)
print('"%s" -> %s' % (addr, result))
if result:
num_ok += 1
else:
Expand Down
8 changes: 5 additions & 3 deletions coders/nyc_parks.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def codeRecord(self, r):
'source': m.group(0),
'type': 'point_of_interest'
}

m = re.search(island_re, title)
if m:
island = m.group(1)
Expand All @@ -263,6 +263,8 @@ def codeRecord(self, r):
m = re.search(bridge_re, title)
if m:
bridge = m.group(1)
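# if not ('Bridge' in bridge or 'bridge' in bridge):
# XXX this is weird: `not` binds tighter than `or`, so the condition below
# parses as (not 'Bridge' in bridge) or ('bridge' in bridge), which is not
# the same as the parenthesised form above.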
if not 'Bridge' in bridge or 'bridge' in bridge:
bridge += ' Bridge'
if bridge not in bridges:
Expand Down Expand Up @@ -292,7 +294,7 @@ def getLatLonFromGeocode(self, geocode, data, r):

def finalize(self):
for missing in [missing_parks, missing_islands, missing_bridges]:
vs = [(v, k) for k, v in missing.iteritems()]
vs = [(v, k) for k, v in missing.items()]
for v, k in reversed(sorted(vs)):
sys.stderr.write('%4d\t%s\n' % (v, k))

Expand All @@ -319,7 +321,7 @@ def name(self):

if result:
num_ok += 1
print '"%s" -> %s' % (addr, result)
print('"%s" -> %s' % (addr, result))
else:
num_bad += 1

Expand Down
17 changes: 8 additions & 9 deletions generate-geocodes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
#
# This is the main driver for the geocoding process.
# Inputs are the records pickle and a collection of 'coders'.
Expand All @@ -14,7 +14,6 @@
import geocoder
import generate_js
import json
import cPickle

# Import order here determines the order in which coders get a crack at each
# record. We want to go in order from precise to imprecise.
Expand Down Expand Up @@ -77,7 +76,7 @@
# TODO(danvk): does this belong here?
lat_lon_map = {}
if options.lat_lon_map:
for line in file(options.lat_lon_map):
for line in open(options.lat_lon_map):
line = line.strip()
if not line: continue
old, new = line.split('->')
Expand All @@ -91,7 +90,7 @@
# Load existing geocodes, if applicable.
id_to_located_rec = {}
if options.previous_geocode_json:
prev_recs = json.load(file(options.previous_geocode_json))
prev_recs = json.load(open(options.previous_geocode_json))
for rec in prev_recs:
if 'extracted' in rec and 'latlon' in rec['extracted']:
x = rec['extracted']
Expand All @@ -107,7 +106,7 @@
for idx, r in enumerate(rs):
if idx % 100 == 0 and idx > 0:
sys.stderr.write('%5d / %5d records processed\n' % (1+idx, len(rs)))

located_rec = (r, None, None)

# Early-out if we've already successfully geocoded this record.
Expand All @@ -126,8 +125,8 @@

if not g:
if options.print_recs:
print '%s\t%s\t%s' % (
c.name(), r.photo_id(), json.dumps(location_data))
print('%s\t%s\t%s' % (
c.name(), r.photo_id(), json.dumps(location_data)))
stats[c.name()] += 1
located_rec = (r, c.name(), location_data)
break
Expand All @@ -150,9 +149,9 @@
location_data['lat'] = lat_lon[0]
location_data['lon'] = lat_lon[1]
if options.print_recs:
print '%s\t%f,%f\t%s\t%s' % (
print('%s\t%f,%f\t%s\t%s' % (
r.photo_id(), lat_lon[0], lat_lon[1], c.name(),
json.dumps(location_data))
json.dumps(location_data)))
stats[c.name()] += 1
located_rec = (r, c.name(), location_data)
break
Expand Down
27 changes: 14 additions & 13 deletions generate_js.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def _generateJson(located_recs, lat_lon_map):
points = 0
photos = 0
is_first = True
for lat_lon, recs in ll_to_id.iteritems():
for lat_lon, recs in ll_to_id.items():
sorted_recs = sorted([r for r in recs
if r.date_range() and r.date_range()[1]],
key=lambda r: r.date_range()[1])
Expand Down Expand Up @@ -81,17 +81,17 @@ def _generateJson(located_recs, lat_lon_map):
def printJson(located_recs, lat_lon_map):
data = _generateJson(located_recs, lat_lon_map)

print "var lat_lons = "
print json.dumps(data)
print("var lat_lons = ")
print(json.dumps(data, sort_keys=True))


def printJsonNoYears(located_recs, lat_lon_map):
data = _generateJson(located_recs, lat_lon_map)
for k, v in data.iteritems():
for k, v in data.items():
data[k] = [x[2] for x in v] # drop both year elements.

print "var lat_lons = "
print json.dumps(data)
print("var lat_lons = ")
print(json.dumps(data, sort_keys=True))


def printRecordsJson(located_recs):
Expand All @@ -108,7 +108,8 @@ def printRecordsJson(located_recs):
'date_range': [ None, None ]
}
}
if r.note(): rec['note'] = r.note()
if r.note():
rec['note'] = r.note()

start, end = r.date_range()
rec['extracted']['date_range'][0] = '%04d-%02d-%02d' % (
Expand All @@ -128,7 +129,7 @@ def printRecordsJson(located_recs):
raise e

recs.append(rec)
print json.dumps(recs, indent=2)
print(json.dumps(recs, indent=2, sort_keys=True))


def printRecordsText(located_recs):
Expand All @@ -145,18 +146,18 @@ def printRecordsText(located_recs):
else:
loc = 'n/a\tn/a'

print '\t'.join([r.photo_id(), date, folder, title, r.preferred_url, coder or 'failed', loc])
print('\t'.join([r.photo_id(), date, folder, title, r.preferred_url, coder or 'failed', loc]))


def printLocations(located_recs):
locs = defaultdict(int)
for r, coder, location_data in located_recs:
if not location_data: continue
if not 'lat' in location_data: continue
if not 'lon' in location_data: continue
if 'lat' not in location_data: continue
if 'lon' not in location_data: continue
lat = location_data['lat']
lon = location_data['lon']
locs['%.6f,%.6f' % (lat, lon)] += 1

for ll, count in locs.iteritems():
print '%d\t%s' % (count, ll)
for ll, count in locs.items():
print('%d\t%s' % (count, ll))
23 changes: 12 additions & 11 deletions geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
# Maintains a cache of previously-geocoded locations and throttles traffic to the Geocoder.

import base64
import os
import re
import sys
import time
import json
import urllib
import urllib.parse

GeocodeUrlTemplate = 'https://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=%s'
CacheDir = "geocache"
Expand All @@ -28,7 +28,7 @@


def _cache_file(loc):
key = base64.b64encode(loc)[:-2] # minus the trailing '=='
key = base64.b64encode(loc.encode('utf8'))[:-2].decode('ascii') # minus the trailing '=='
key = key.replace('/', '-') # '/' is bad in a file name.
key = key[:255] # longest possible filename
return "%s/%s" % (CacheDir, key)
Expand All @@ -46,13 +46,13 @@ def _check_cache(self, loc):
if CacheDebug:
sys.stderr.write('Checking %s\n' % cache_file);
try:
return file(cache_file).read()
return open(cache_file).read()
except:
return None

def _cache_result(self, loc, result):
cache_file = _cache_file(loc)
file(cache_file, "w").write(result)
open(cache_file, "w").write(result)

def _fetch(self, url):
"""Attempts to fetch the URL. Does rate throttling. Returns XML."""
Expand All @@ -76,17 +76,17 @@ def _check_for_lat_lon(self, address):

def Locate(self, address, check_cache=True):
"""Returns a maps API JSON response for the address or None.
Address should be a fully-qualified address, e.g.
'111 8th Avenue, New York, NY'.
"""
url = GeocodeUrlTemplate % urllib.quote(address)
url = GeocodeUrlTemplate % urllib.parse.quote(address)

data = None
from_cache = False
if check_cache:
data = self._check_cache(address)
from_cache = data != None
from_cache = data is not None
if not data:
data = self._check_for_lat_lon(address)
if not data:
Expand All @@ -103,7 +103,7 @@ def Locate(self, address, check_cache=True):
sys.stderr.write('Error status %s %s\n' % (status, json.dumps(response)))
if status == 'OVER_QUERY_LIMIT':
raise Exception('Over your quota for the day!')

return None
if not from_cache and response:
self._cache_result(address, data)
Expand All @@ -112,15 +112,16 @@ def Locate(self, address, check_cache=True):

def InCache(self, loc):
data = self._check_cache(loc)
return data == None
return data is None # XXX this looks backwards

def LocateFromCache(self, loc):
"""Like Locate, but never goes to the network to get a location."""
data = self._check_cache(loc)
if not data: return None
if not data:
return None
return json.loads(data)


if __name__ == '__main__':
for arg in sys.argv[1:]:
print '%s --> %s' % (arg, _cache_file(arg))
print('%s --> %s' % (arg, _cache_file(arg)))
Loading

0 comments on commit 463f1c3

Please sign in to comment.