Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented Remote, Search within State or Province, and Search within Country #113

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions demo/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,29 @@ search:
- MONSTER

# Region that we are searching for jobs within:
province_or_state: "ON" # NOTE: this is generally 2 characters long.
city: "Waterloo" # NOTE: this is the full city / town name.
# NOTE: this is generally 2 characters long, but can be 'Null' (no quotes)
# to search by country, or can be some value while city is 'Null' to search
# within the province or state.
province_or_state: "ON"
# NOTE: this is the full city / town name, but can be 'Null', with
# province_or_state set to 'Null' to search country-wide.
city: "Waterloo"
radius: 25 # km (NOTE: if we were in locale: USA_ENGLISH it's in miles)
remote_within_country: False

# To search entire country if one is willing to relocate anywhere:
# province_or_state: Null
# city: Null
# remote_within_country: False
# To search within an entire province or state (radius is ignored):
# province_or_state: (e.g. Scotland)
# city: Null
# remote_within_country: False
# To search entire country for remote roles:
# province_or_state: Null
# city: Null
# remote_within_country: True


# These are the terms you would be typing into the website's search field:
keywords:
Expand All @@ -41,8 +61,8 @@ search:
company_block_list:
- "Infox Consulting"

# The desired level of work-remoteness (i.e. IN_PERSON, FULLY_REMOTE, ANY,
# TEMPORARILY_REMOTE, PARTIALLY_REMOTE)
# The desired level of work-remoteness, if a label exists in the job listing
# (i.e. IN_PERSON, FULLY_REMOTE, ANY, TEMPORARILY_REMOTE, PARTIALLY_REMOTE)
remoteness: ANY

# Logging level options are: critical, error, warning, info, debug, notset
Expand Down
23 changes: 21 additions & 2 deletions demo/settings_USA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,28 @@ search:
- MONSTER

# Region that we are searching for jobs within:
province_or_state: "Texas" # NOTE: this is generally 2 characters long.
city: "Richardson" # NOTE: this is the full city / town name.
# NOTE: this is generally 2 characters long, but can be 'Null' (no quotes)
# to search by country, or can be some value while city is 'Null' to search
# within the province or state.
province_or_state: "Texas"
# NOTE: this is the full city / town name, but can be 'Null', with
# province_or_state set to 'Null' to search country-wide.
city: "Richardson"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way the schema has been implemented, we can simply omit province_or_state / city entirely, and they will be set to the default value in the schema, which should be None.

i.e. the user shouldn't be setting this to a Null string.

radius: 25 # km (NOTE: if we were in locale: USA_ENGLISH it's in miles)
remote_within_country: False

# To search entire country if one is willing to relocate anywhere:
# province_or_state: Null
# city: Null
# remote_within_country: False
# To search within an entire province or state (radius is ignored):
# province_or_state: (e.g. Scotland)
# city: Null
# remote_within_country: False
# To search entire country for remote roles:
# province_or_state: Null
# city: Null
# remote_within_country: True

# These are the terms you would be typing into the website's search field:
keywords:
Expand Down
1 change: 0 additions & 1 deletion jobfunnel/backend/scrapers/glassdoor.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
"""
# Get the search url
search_url, data = self.get_search_url(method='post')

# Get the search page result.
request_html = self.session.post(search_url, data=data)
soup_base = BeautifulSoup(request_html.text, self.config.bs4_parser)
Expand Down
78 changes: 70 additions & 8 deletions jobfunnel/backend/scrapers/indeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,42 @@ def _get_search_url(self, method: Optional[str] = 'get') -> str:
TODO: use Enum for method instead of str.
"""
if method == 'get':
return (
"https://www.indeed.{}/jobs?q={}&l={}%2C+{}&radius={}&"
"limit={}&filter={}{}".format(
search_url = "https://www.indeed.{}/jobs?q={}".format(
self.config.search_config.domain,
self.query,
self.query
)
# search countrywide supposing one is open to relocating within
# the country (includes remote)
if self.config.search_config.city is None and \
self.config.search_config.province_or_state is None and \
self.config.search_config.remote_within_country is False:
return search_url
# search remote within the country
if self.config.search_config.remote_within_country is True:
return (
search_url + "&l=remote&"
"limit={}&filter={}{}".format(
self.config.search_config.province_or_state.upper(),
self.max_results_per_page,
int(self.config.search_config.return_similar_results),
REMOTENESS_TO_QUERY[self.config.search_config.remoteness]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would make more sense for us to construct the URL in an additive way rather than with this if/else-and-return approach. This way we can easily add or remove portions, and later update to using a URL-construction library (which helps with constructing the URL string from tags).

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i.e.

if search_config.province_or_state: 
    search_url += "...."

)
)
# search by state or province with no city
elif self.config.search_config.city is None and \
self.config.search_config.province_or_state:
return (
search_url + "&l={}&"
"limit={}&filter={}{}".format(
self.config.search_config.province_or_state.upper(),
self.max_results_per_page,
int(self.config.search_config.return_similar_results),
REMOTENESS_TO_QUERY[self.config.search_config.remoteness]
)
)
return (
search_url + "&l={}%2C+{}&radius={}&"
"limit={}&filter={}{}".format(
self.config.search_config.city.replace(' ', '+',),
self.config.search_config.province_or_state.upper(),
self._quantize_radius(self.config.search_config.radius),
Expand Down Expand Up @@ -338,11 +369,42 @@ def _get_search_url(self, method: Optional[str] = 'get') -> str:
TODO: use Enum for method instead of str.
"""
if method == 'get':
return (
"https://www.indeed.{}/jobs?q={}&l={}&radius={}&"
"limit={}&filter={}{}".format(
search_url = "https://www.indeed.{}/jobs?q={}".format(
self.config.search_config.domain,
self.query,
self.query
)
# search countrywide supposing one is open to relocating within
# the country (includes remote)
if self.config.search_config.city is None and \
self.config.search_config.province_or_state is None and \
self.config.search_config.remote_within_country is False:
return search_url
# search remote within the country
elif self.config.search_config.remote_within_country is True:
return (
search_url + "&l=remote&"
"limit={}&filter={}{}".format(
self.max_results_per_page,
int(self.config.search_config.return_similar_results),
REMOTENESS_TO_QUERY[self.config.search_config.remoteness]
)
)
# search by state or province, does not factor in city or radius
elif self.config.search_config.city is None and \
self.config.search_config.province_or_state and \
self.config.search_config.remote_within_country is False:
return (
search_url + "&l={}&"
"limit={}&filter={}{}".format(
self.config.search_config.province_or_state.upper(),
self.max_results_per_page,
int(self.config.search_config.return_similar_results),
REMOTENESS_TO_QUERY[self.config.search_config.remoteness],
)
)
return (
search_url + "&l={}&radius={}&"
"limit={}&filter={}{}".format(
self.config.search_config.city.replace(' ', '+',),
self._quantize_radius(self.config.search_config.radius),
self.max_results_per_page,
Expand Down
54 changes: 44 additions & 10 deletions jobfunnel/backend/scrapers/monster.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,15 +260,32 @@ def _get_search_url(self, method: Optional[str] = 'get',
all previous jobs as we go.
"""
if method == 'get':
return (
'https://www.monster.{}/jobs/search/?{}q={}&where={}__2C-{}'
'&rad={}'.format(
search_url = 'https://www.monster.{}/jobs/search/?{}q={}'.format(
self.config.search_config.domain,
f'page={page}&' if page > 1 else '',
self.query,
self.config.search_config.city.replace(' ', '-'),
)
# search countrywide supposing one is open to relocating within
# the country (includes remote)
if self.config.search_config.city is None and \
self.config.search_config.province_or_state is None and \
self.config.search_config.remote_within_country is False:
return search_url
# search remote within the country
elif self.config.search_config.remote_within_country is True:
return search_url + '-remote'
# search by state or province, does not factor in city or radius
elif self.config.search_config.city is None and \
self.config.search_config.province_or_state and \
self.config.search_config.remote_within_country is False:
return search_url + '&where={}'.format(
self.config.search_config.province_or_state,
self._convert_radius(self.config.search_config.radius)
)
return (
search_url + '&where={}__2C-{}&rad={}'.format(
self.config.search_config.city.replace(' ', '-'),
self.config.search_config.province_or_state,
self._convert_radius(self.config.search_config.radius)
Comment on lines +263 to +288
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this might overcomplicate things at this stage. The config builder should process all user provided input from the CLI and YAML and create a valid configuration. This configuration should then be directly processed by the scrapers.

The potential issue I see with complicating the logic for setting up the URL is that it is a lot harder to extend the scrapers with different LOCALES. Right now, we try to optimise the scraper classes such that it is easy to support more LOCALES.

)
)
elif method == 'post':
Expand Down Expand Up @@ -360,14 +377,31 @@ def _get_search_url(self, method: Optional[str] = 'get',
all previous jobs as we go.
"""
if method == 'get':
return (
'https://www.monster.{}/jobs/search/?{}q={}&where={}'
'&rad={}'.format(
search_url = 'https://www.monster.{}/jobs/search/?{}q={}'.format(
self.config.search_config.domain,
f'page={page}&' if page > 1 else '',
self.query,
self.config.search_config.city.replace(' ', '-'),
self._convert_radius(self.config.search_config.radius)
)
# search countrywide supposing one is open to relocating within
# the country (includes remote)
if self.config.search_config.city is None and \
self.config.search_config.province_or_state is None and \
self.config.search_config.remote_within_country is False:
return search_url
# search remote within the country
elif self.config.search_config.remote_within_country is True:
return search_url + '-remote'
# search by state or province, does not factor in city or radius
elif self.config.search_config.city is None and \
self.config.search_config.province_or_state and \
self.config.search_config.remote_within_country is False:
return search_url + '&where={}'.format(
self.config.search_config.province_or_state,
)
return (
search_url + '&where={}&rad={}'.format(
self.config.search_config.city.replace(' ', '-'),
self._convert_radius(self.config.search_config.radius)
Comment on lines +384 to +404
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See previous comment

)
)
elif method == 'post':
Expand Down
19 changes: 15 additions & 4 deletions jobfunnel/config/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,18 +151,28 @@ def parse_cli(args: List[str]) -> Dict[str, Any]:
search_group.add_argument(
'-ps',
dest='search.province_or_state',
type=str,
type=None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to make this an Optional[str]? We want to make sure this is a string if the user provides a value.

default=None,
help='Province/state value for your job-search area of interest. '
'(i.e. Ontario).',
required=True,
required=False,
)

search_group.add_argument(
'-c',
dest='search.city',
type=str,
type=None,
default=None,
help='City/town value for job-search region (i.e. Waterloo).',
required=True,
required=False,
)

search_group.add_argument(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit confused by the remote-in-city and remote-in-country arguments, why can't we simply provide a --remote argument?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just tried to make this part correspond 1:1 with the config.yaml.
If a city is indicated and remote is set, then remote will override the search.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it doesn't make sense to me why we need this: if a user provides a country but no city with --remote, it would have the intended effect, no?

'-remote_in_ctry',
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe use --remote-in-country.

dest='search.remote_within_country',
action='store_false',
help='Remote within locale',
required=False,
)

search_group.add_argument(
Expand Down Expand Up @@ -358,6 +368,7 @@ def get_config_manager(config: Dict[str, Any]) -> JobFunnelConfigManager:
blocked_company_names=config['search']['company_block_list'],
locale=Locale[config['search']['locale']],
providers=[Provider[p] for p in config['search']['providers']],
remote_within_country=config['search']['remote_within_country'],
remoteness=Remoteness[config['search']['remoteness']],
)

Expand Down
4 changes: 2 additions & 2 deletions jobfunnel/config/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(self,
max_listing_days: Optional[int] = None,
blocked_company_names: Optional[List[str]] = None,
domain: Optional[str] = None,
remote_within_country: bool = False,
remoteness: Optional[Remoteness] = Remoteness.ANY):
"""Search config for all job sources

Expand Down Expand Up @@ -56,6 +57,7 @@ def __init__(self,
self.return_similar_results = return_similar_results # Indeed.X thing
self.max_listing_days = max_listing_days or DEFAULT_MAX_LISTING_DAYS
self.blocked_company_names = blocked_company_names
self.remote_within_country = remote_within_country
self.remoteness = remoteness

# Try to infer the domain string based on the locale.
Expand All @@ -75,8 +77,6 @@ def query_string(self) -> str:
def validate(self):
"""We need to have the right information set, not mixing stuff
"""
assert self.province_or_state, "Province/State not set"
assert self.city, "City not set"
assert self.locale, "Locale not set"
assert self.providers and len(self.providers) >= 1, "Providers not set"
assert self.keywords and len(self.keywords) >= 1, "Keywords not set"
Expand Down
17 changes: 15 additions & 2 deletions jobfunnel/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,21 @@
'required': True,
'allowed': [l.name for l in Locale],
},
'province_or_state': {'required': True, 'type': 'string'},
'city': {'required': True, 'type': 'string'},
'province_or_state': {
'required': True,
'type': 'string',
'nullable': True
},
'city': {
'required': True,
'type': 'string',
'nullable': True
},
'remote_within_country': {
'required': True,
'type': 'boolean',
'default': DEFAULT_REMOTE_WITHIN_COUNTRY
},
'radius': {
'required': False,
'type': 'integer',
Expand Down
1 change: 1 addition & 0 deletions jobfunnel/resources/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
DEFAULT_RETURN_SIMILAR_RESULTS = False
DEFAULT_RANDOM_DELAY = False
DEFAULT_RANDOM_CONVERGING_DELAY = False
DEFAULT_REMOTE_WITHIN_COUNTRY = False
DEFAULT_REMOTENESS = Remoteness.ANY

# Defaults we use from localization, the scraper can always override it.
Expand Down