Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use local locations DB #104

Merged
merged 8 commits into from
Nov 4, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

### Fixed

- #98 - Add handling for `Lumen` notification with Alt Circuit ID
- #98 - Add handling for `Lumen` notification with Alt Circuit ID.
- #99 - Extend `Zayo` Html parser to handle different table headers.
- #103 - Add `Equinix` provider.
- #104 - Use a local locations DB to map city to timezone as first option, keeping API as fallback option.
- #105 - Extend `Colt` parser to support multiple `Maintenance` statuses.

## v2.0.3 - 2021-10-01

Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,7 @@ The project is following Network to Code software development guidelines and is

For any questions or comments, please check the [FAQ](FAQ.md) first and feel free to swing by the [Network to Code slack channel](https://networktocode.slack.com/) (channel #networktocode).
Sign up [here](http://slack.networktocode.com/)

## License notes

This library uses a Basic World Cities Database by Pareto Software, LLC, the owner of Simplemaps.com: The Provider offers a Basic World Cities Database free of charge. This database is licensed under the Creative Commons Attribution 4.0 license as described at: https://creativecommons.org/licenses/by/4.0/.
41,002 changes: 41,002 additions & 0 deletions circuit_maintenance_parser/data/worldcities.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions circuit_maintenance_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from circuit_maintenance_parser.errors import ParserError
from circuit_maintenance_parser.output import Status, Impact, CircuitImpact
from circuit_maintenance_parser.constants import EMAIL_HEADER_SUBJECT, EMAIL_HEADER_DATE
from circuit_maintenance_parser.utils import Geolocator

# pylint: disable=no-member

Expand All @@ -33,6 +34,8 @@ class Parser(BaseModel, extra=Extra.forbid):
# _data_types are used to match the Parser to each type of DataPart
_data_types = ["text/plain", "plain"]

_geolocator = Geolocator()

@classmethod
def get_data_types(cls) -> List[str]:
"""Return the expected data type."""
Expand Down
3 changes: 1 addition & 2 deletions circuit_maintenance_parser/parsers/cogent.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from bs4.element import ResultSet # type: ignore

from circuit_maintenance_parser.parser import Html, Impact, CircuitImpact, Status
from circuit_maintenance_parser.utils import city_timezone

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -48,7 +47,7 @@ def parse_div(self, divs: ResultSet, data: Dict): # pylint: disable=too-many-lo
elif line.startswith("Cogent customers receiving service"):
match = re.search(r"[^Cogent].*?((\b[A-Z][a-z\s-]+)+, ([A-Za-z-]+[\s-]))", line)
if match:
parsed_timezone = city_timezone(match.group(1).strip())
parsed_timezone = self._geolocator.city_timezone(match.group(1).strip())
local_timezone = timezone(parsed_timezone)
# set start time using the local city timezone
start = datetime.strptime(start_str, "%I:%M %p %d/%m/%Y")
Expand Down
114 changes: 100 additions & 14 deletions circuit_maintenance_parser/utils.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,113 @@
"""Utility functions for the library."""
import os
import logging
from typing import Tuple, Dict, Union
import csv

from geopy.exc import GeocoderUnavailable, GeocoderTimedOut, GeocoderServiceError # type: ignore
from geopy.geocoders import Nominatim # type: ignore
from tzwhere import tzwhere # type: ignore
import backoff # type: ignore

from .errors import ParserError

logger = logging.getLogger(__name__)

dirname = os.path.dirname(__file__)


class Geolocator:
"""Class to obtain Geo Location coordinates."""

# Keeping caching of local DB and timezone in the class
db_location: Dict[Union[Tuple[str, str], str], Tuple[float, float]] = {}
timezone = None

def __init__(self):
    """Make sure the class-level locations DB and timezone resolver are loaded."""
    # Same order as before: locations DB first, then the timezone resolver.
    for loader in (self.load_db_location, self.load_timezone):
        loader()

@classmethod
def load_timezone(cls):
    """Create the tzwhere resolver and cache it on the class, unless already cached."""
    if cls.timezone is not None:
        # A previous call already stored the resolver on the class.
        return
    cls.timezone = tzwhere.tzwhere()
    logger.info("Loaded local timezone resolver.")

@classmethod
def load_db_location(cls):
    """Load the locations DB from the bundled CSV into the class-level dict.

    Every row is indexed twice in ``cls.db_location``: under the
    ``(city_ascii, country)`` tuple and under the bare ``city_ascii`` string.
    For the bare-name key the first row seen wins when names are duplicated.
    The CSV is parsed only while the cache is empty; later calls return at once.
    """
    if cls.db_location:
        # Cache already populated by an earlier call; skip re-reading the file.
        return

    csv_path = os.path.join(dirname, "data", "worldcities.csv")
    # Do not depend on the platform default encoding to decode the file
    # (assumes the bundled CSV is UTF-8 -- TODO confirm against the data file).
    # newline="" is what the csv module recommends for files handed to a reader.
    with open(csv_path, encoding="utf-8", newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            coordinates = (float(row["lat"]), float(row["lng"]))
            # Index by city and country
            cls.db_location[(row["city_ascii"], row["country"])] = coordinates
            # Index by city (first entry wins if duplicated names)
            cls.db_location.setdefault(row["city_ascii"], coordinates)

def city_timezone(city: str) -> str:
"""Get the timezone for a given city.
def get_location(self, city: str) -> Tuple[float, float]:
"""Get location."""
try:
location_coordinates = self.get_location_from_local_file(city)
except ValueError:
location_coordinates = self.get_location_from_api(city)

Args:
city (str): Geographic location name
"""
try:
logger.debug(
"Resolved city %s to coordinates: lat %s - lon %s", city, location_coordinates[0], location_coordinates[1],
)
return location_coordinates

def get_location_from_local_file(self, city: str) -> Tuple[float, float]:
    """Get location from the local locations DB.

    The ``(city, country)`` key is tried first, then the bare city name.

    Args:
        city (str): Location name; the text before the first ``", "`` is used as
            the city and the text after the last ``", "`` as the country.

    Returns:
        Tuple[float, float]: ``(latitude, longitude)`` stored for the location.

    Raises:
        ValueError: If neither key is present in the local locations DB.
    """
    # Without a ", " separator both parts are the same string, so the tuple
    # lookup simply misses and the bare-name lookup decides.
    parts = city.split(", ")
    city_name, country = parts[0], parts[-1]

    coordinates = self.db_location.get((city_name, country))
    if coordinates is None:
        coordinates = self.db_location.get(city_name)

    # Test for a missing entry explicitly: 0.0 is a valid latitude/longitude
    # and must not be mistaken for "not found".
    if coordinates is None:
        logger.debug("City %s was not resolvable in the local locations DB.", city)
        raise ValueError(f"City {city} was not found in the local locations DB")

    lat, lng = coordinates
    logger.debug("Resolved %s to lat %s, lon %s from local locations DB.", city, lat, lng)
    return (lat, lng)

@staticmethod
@backoff.on_exception(
backoff.expo, (GeocoderUnavailable, GeocoderTimedOut, GeocoderServiceError), max_time=10, logger=logger,
)
def get_location_from_api(city: str) -> Tuple[float, float]:
"""Get location from API."""
geolocator = Nominatim(user_agent="circuit_maintenance")
location = geolocator.geocode(city) # API call to OpenStreetMap web service
timezone = (
tzwhere.tzwhere()
) # TODO: Offline loading of timezone location data is quite slow. Look for better alternative
return timezone.tzNameAt(location.latitude, location.longitude)
except (GeocoderUnavailable, GeocoderTimedOut, GeocoderServiceError):
raise ParserError( # pylint: disable=raise-missing-from
"Cannot connect to the remote Geolocator API to determine timezone"
)
logger.debug("Resolved %s to %s from OpenStreetMap webservice.", city, location)
return (location.latitude, location.longitude)

def city_timezone(self, city: str) -> str:
"""Get the timezone for a given city.

Args:
city (str): Geographic location name
"""
if self.timezone is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this check needed? Same reason.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as before, maybe it assumes that the method is available for class, no instance and fails the validation

try:
latitude, longitude = self.get_location(city)
timezone = self.timezone.tzNameAt(latitude, longitude)
if not timezone:
# In some cases, given a latitude and longitude, the tzwhere library returns
# an empty timezone, so we try with the coordinates from the API as an alternative
latitude, longitude = self.get_location_from_api(city)
timezone = self.timezone.tzNameAt(latitude, longitude)

if timezone:
logger.debug("Matched city %s to timezone %s", city, timezone)
return timezone
except Exception as exc:
logger.error("Cannot obtain the timezone for city %s: %s", city, exc)
raise ParserError( # pylint: disable=raise-missing-from
f"Cannot obtain the timezone for city {city}: {exc}"
)
raise ParserError("Timezone resolution not properly initalized.")


def rgetattr(obj, attr):
Expand Down
26 changes: 13 additions & 13 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ bs4 = "^0.0.1"
lxml = "^4.6.2"
geopy = "^2.1.0"
tzwhere = "^3.0.3"
backoff = "^1.11.1"

[tool.poetry.dev-dependencies]
pytest = "^6.2.2"
Expand Down
47 changes: 23 additions & 24 deletions tests/unit/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
GenericProvider,
AquaComms,
AWS,
# Cogent,
Cogent,
Colt,
EUNetworks,
HGC,
Expand Down Expand Up @@ -62,29 +62,28 @@
[Path(dir_path, "data", "aws", "aws2_result.json"),],
),
# Cogent
# TODO: Recover tests back when issue #101 is fixed
# (
# Cogent,
# [
# ("html", Path(dir_path, "data", "cogent", "cogent1.html")),
# (EMAIL_HEADER_DATE, Path(dir_path, "data", "date", "email_date_1")),
# ],
# [
# Path(dir_path, "data", "cogent", "cogent1_result.json"),
# Path(dir_path, "data", "date", "email_date_1_result.json"),
# ],
# ),
# (
# Cogent,
# [
# ("html", Path(dir_path, "data", "cogent", "cogent2.html")),
# (EMAIL_HEADER_DATE, Path(dir_path, "data", "date", "email_date_1")),
# ],
# [
# Path(dir_path, "data", "cogent", "cogent2_result.json"),
# Path(dir_path, "data", "date", "email_date_1_result.json"),
# ],
# ),
(
Cogent,
[
("html", Path(dir_path, "data", "cogent", "cogent1.html")),
(EMAIL_HEADER_DATE, Path(dir_path, "data", "date", "email_date_1")),
],
[
Path(dir_path, "data", "cogent", "cogent1_result.json"),
Path(dir_path, "data", "date", "email_date_1_result.json"),
],
),
(
Cogent,
[
("html", Path(dir_path, "data", "cogent", "cogent2.html")),
(EMAIL_HEADER_DATE, Path(dir_path, "data", "date", "email_date_1")),
],
[
Path(dir_path, "data", "cogent", "cogent2_result.json"),
Path(dir_path, "data", "date", "email_date_1_result.json"),
],
),
# Colt
(
Colt,
Expand Down
22 changes: 11 additions & 11 deletions tests/unit/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from circuit_maintenance_parser.parsers.aquacomms import HtmlParserAquaComms1, SubjectParserAquaComms1
from circuit_maintenance_parser.parsers.aws import SubjectParserAWS1, TextParserAWS1

# from circuit_maintenance_parser.parsers.cogent import HtmlParserCogent1
from circuit_maintenance_parser.parsers.cogent import HtmlParserCogent1
from circuit_maintenance_parser.parsers.colt import CsvParserColt1
from circuit_maintenance_parser.parsers.equinix import HtmlParserEquinix, SubjectParserEquinix
from circuit_maintenance_parser.parsers.gtt import HtmlParserGTT1
Expand Down Expand Up @@ -76,16 +76,16 @@
Path(dir_path, "data", "aws", "aws2_subject_parser_result.json"),
),
# Cogent
# TODO: Recover testing when issue #101 is fixed # (
# HtmlParserCogent1,
# Path(dir_path, "data", "cogent", "cogent1.html"),
# Path(dir_path, "data", "cogent", "cogent1_result.json"),
# ),
# (
# HtmlParserCogent1,
# Path(dir_path, "data", "cogent", "cogent2.html"),
# Path(dir_path, "data", "cogent", "cogent2_result.json"),
# ),
(
HtmlParserCogent1,
Path(dir_path, "data", "cogent", "cogent1.html"),
Path(dir_path, "data", "cogent", "cogent1_result.json"),
),
(
HtmlParserCogent1,
Path(dir_path, "data", "cogent", "cogent2.html"),
Path(dir_path, "data", "cogent", "cogent2_result.json"),
),
# Colt
(
CsvParserColt1,
Expand Down
Loading