Commit 0b39f8c
1 parent: a1cc170

* added data extraction files
* additional script files
* parsing and processing file run_first.py
* main.py
* removed cohere dependency, updated code (#261) (cherry picked from commit a1cc170)
* removed tkinter, removed cohere-dependent code
* added resumes, JDs with new name format

Showing 23 changed files with 1,078 additions and 2,579 deletions.
265 changes: 0 additions & 265 deletions
...n/JobDescription-job_desc_front_end_engineer.pdf9e508eff-a083-4e2d-8b1e-577cbc8f50fc.json
This file was deleted.

261 changes: 0 additions & 261 deletions
.../JobDescription-job_desc_full_stack_engineer.pdfcccf72e0-5f20-4aa8-8679-d91b720b7247.json
This file was deleted.

283 changes: 0 additions & 283 deletions
...ption/JobDescription-job_desc_java_developer.pdf1fb6435c-531a-4f04-84c2-c4e73e1f1a3f.json
This file was deleted.

247 changes: 0 additions & 247 deletions
...tion/JobDescription-job_desc_product_manager.pdf0b613898-1183-49c5-ad07-e03bd9af72e5.json
This file was deleted.

324 changes: 0 additions & 324 deletions
...rocessed/Resumes/Resume-alfred_pennyworth_pm.pdfb36fac00-1d60-49ba-a9e4-33477c928e98.json
This file was deleted.

260 changes: 0 additions & 260 deletions
Data/Processed/Resumes/Resume-barry_allen_fe.pdfbef9d9b1-fa14-4aa0-8111-6033cceacb5a.json
This file was deleted.

345 changes: 0 additions & 345 deletions
...ocessed/Resumes/Resume-bruce_wayne_fullstack.pdf04c228e3-04a4-4970-a149-a3cffc92aee8.json
This file was deleted.

304 changes: 0 additions & 304 deletions
Data/Processed/Resumes/Resume-harvey_dent_mle.pdf5dde9183-5af4-4107-91e1-7b2d66fec490.json
This file was deleted.

289 changes: 0 additions & 289 deletions
Data/Processed/Resumes/Resume-john_doe.pdf817fb796-14b9-4819-a224-7eed66b3ce04.json
This file was deleted.
Empty file.
@@ -0,0 +1,221 @@
import re
import urllib.request  # urlopen lives in urllib.request, not the bare urllib package

import spacy

from resume_matcher.dataextractor.TextCleaner import TextCleaner

# Load the English model
nlp = spacy.load("en_core_web_md")

RESUME_SECTIONS = [
    "Contact Information",
    "Objective",
    "Summary",
    "Education",
    "Experience",
    "Skills",
    "Projects",
    "Certifications",
    "Licenses",
    "Awards",
    "Honors",
    "Publications",
    "References",
    "Technical Skills",
    "Computer Skills",
    "Programming Languages",
    "Software Skills",
    "Soft Skills",
    "Language Skills",
    "Professional Skills",
    "Transferable Skills",
    "Work Experience",
    "Professional Experience",
    "Employment History",
    "Internship Experience",
    "Volunteer Experience",
    "Leadership Experience",
    "Research Experience",
    "Teaching Experience",
]

class DataExtractor:
    """
    A class for extracting various types of data from text.
    """

    def __init__(self, raw_text: str):
        """
        Initialize the DataExtractor object.

        Args:
            raw_text (str): The raw input text.
        """
        self.text = raw_text
        self.clean_text = TextCleaner.clean_text(self.text)
        self.doc = nlp(self.clean_text)

    def extract_links(self):
        """
        Find links of any type in the raw text.

        Returns:
            list: A list containing all the found links.
        """
        link_pattern = r"\b(?:https?://|www\.)\S+\b"
        links = re.findall(link_pattern, self.text)
        return links

    def extract_links_extended(self):
        """
        Extract links of all kinds (HTTP, HTTPS, FTP, email, www.linkedin.com,
        and github.com/user_name) from a webpage.

        Returns:
            list: A list containing all the extracted links.
        """
        links = []
        try:
            # Note: unlike the other extractors, this treats self.text as a
            # URL and fetches the page it points at.
            response = urllib.request.urlopen(self.text)
            html_content = response.read().decode("utf-8")
            pattern = r'href=[\'"]?([^\'" >]+)'
            raw_links = re.findall(pattern, html_content)
            for link in raw_links:
                if link.startswith(
                    (
                        "http://",
                        "https://",
                        "ftp://",
                        "mailto:",
                        "www.linkedin.com",
                        "github.com/",
                        "twitter.com",
                    )
                ):
                    links.append(link)
        except Exception as e:
            print(f"Error extracting links: {str(e)}")
        return links

    def extract_names(self):
        """
        Extract names from the raw text using spaCy's named entity recognition.

        Returns:
            list: A list of strings representing the names extracted from the text.
        """
        names = [ent.text for ent in self.doc.ents if ent.label_ == "PERSON"]
        return names

    def extract_emails(self):
        """
        Extract email addresses from the raw text.

        Returns:
            list: A list containing all the extracted email addresses.
        """
        email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
        emails = re.findall(email_pattern, self.text)
        return emails

    def extract_phone_numbers(self):
        """
        Extract phone numbers from the raw text.

        Returns:
            list: A list containing all the extracted phone numbers.
        """
        # Non-capturing group and no ^/$ anchors, so findall returns full
        # matches found anywhere in the text (the anchored, capturing version
        # only ever returned the country-code group on a whole-string match).
        phone_number_pattern = (
            r"(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
        )
        phone_numbers = re.findall(phone_number_pattern, self.text)
        return phone_numbers

    def extract_experience(self):
        """
        Extract the experience section from the raw text using spaCy tokens.

        Returns:
            str: A string containing all the extracted experience.
        """
        experience_section = []
        in_experience_section = False

        for token in self.doc:
            if token.text in RESUME_SECTIONS:
                # Only an "Experience" heading (any casing) keeps us in the
                # section; any other known section heading ends it. The
                # original chained `or` comparison was always truthy.
                in_experience_section = token.text.lower() == "experience"

            if in_experience_section:
                experience_section.append(token.text)

        return " ".join(experience_section)

    def extract_position_year(self):
        """
        Extract position and year ranges from the raw text.

        Returns:
            list: A list of (position, start year, end year) tuples.
        """
        position_year_search_pattern = (
            r"(\b\w+\b\s+\b\w+\b),\s+(\d{4})\s*-\s*(\d{4}|\bpresent\b)"
        )
        position_year = re.findall(position_year_search_pattern, self.text)
        return position_year

    def extract_particular_words(self):
        """
        Extract nouns and proper nouns from the given text.

        Returns:
            list: A list of extracted nouns.
        """
        pos_tags = ["NOUN", "PROPN"]
        nouns = [token.text for token in self.doc if token.pos_ in pos_tags]
        return nouns

    def extract_entities(self):
        """
        Extract named entities of types 'GPE' (geopolitical entity) and
        'ORG' (organization) from the given text.

        Returns:
            list: A list of unique extracted entities.
        """
        entity_labels = ["GPE", "ORG"]
        entities = [
            ent.text for ent in self.doc.ents if ent.label_ in entity_labels
        ]
        return list(set(entities))
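
A minimal usage sketch of the extractor above. The sample resume text is hypothetical, and the import path is an assumption based on the TextCleaner import in this file; en_core_web_md must be installed for spaCy to load.

# Hypothetical sample input; assumed import path.
from resume_matcher.dataextractor.DataExtractor import DataExtractor

sample = (
    "John Doe\n"
    "john.doe@example.com | (555) 123-4567 | https://github.com/johndoe\n"
    "Experience\n"
    "Software Engineer, 2019 - present"
)
extractor = DataExtractor(sample)
print(extractor.extract_emails())         # ['john.doe@example.com']
print(extractor.extract_phone_numbers())  # ['(555) 123-4567']
print(extractor.extract_links())          # ['https://github.com/johndoe']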
@@ -0,0 +1,154 @@
import spacy
import textacy
from textacy import extract

# Load the English model
nlp = spacy.load("en_core_web_md")

RESUME_SECTIONS = [
    "Contact Information",
    "Objective",
    "Summary",
    "Education",
    "Experience",
    "Skills",
    "Projects",
    "Certifications",
    "Licenses",
    "Awards",
    "Honors",
    "Publications",
    "References",
    "Technical Skills",
    "Computer Skills",
    "Programming Languages",
    "Software Skills",
    "Soft Skills",
    "Language Skills",
    "Professional Skills",
    "Transferable Skills",
    "Work Experience",
    "Professional Experience",
    "Employment History",
    "Internship Experience",
    "Volunteer Experience",
    "Leadership Experience",
    "Research Experience",
    "Teaching Experience",
]

REGEX_PATTERNS = {
    "email_pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    "phone_pattern": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
    "link_pattern": r"\b(?:https?://|www\.)\S+\b",
}

READ_RESUME_FROM = "Data/Resumes/"
SAVE_DIRECTORY_RESUME = "Data/Processed/Resumes"

READ_JOB_DESCRIPTION_FROM = "Data/JobDescription/"
SAVE_DIRECTORY_JOB_DESCRIPTION = "Data/Processed/JobDescription"

class KeytermExtractor:
    """
    A class for extracting keyterms from a given text using various algorithms.
    """

    def __init__(self, raw_text: str, top_n_values: int = 20):
        """
        Initialize the KeytermExtractor object.

        Args:
            raw_text (str): The raw input text.
            top_n_values (int): The number of top keyterms to extract.
        """
        self.raw_text = raw_text
        self.text_doc = textacy.make_spacy_doc(self.raw_text, lang="en_core_web_md")
        self.top_n_values = top_n_values

    def get_keyterms_based_on_textrank(self):
        """
        Extract keyterms using the TextRank algorithm.

        Returns:
            list: (keyterm, score) tuples for the top keyterms by TextRank.
        """
        return list(
            extract.keyterms.textrank(
                self.text_doc, normalize="lemma", topn=self.top_n_values
            )
        )

    def get_keyterms_based_on_sgrank(self):
        """
        Extract keyterms using the SGRank algorithm.

        Returns:
            list: (keyterm, score) tuples for the top keyterms by SGRank.
        """
        return list(
            extract.keyterms.sgrank(
                self.text_doc, normalize="lemma", topn=self.top_n_values
            )
        )

    def get_keyterms_based_on_scake(self):
        """
        Extract keyterms using the sCAKE algorithm.

        Returns:
            list: (keyterm, score) tuples for the top keyterms by sCAKE.
        """
        return list(
            extract.keyterms.scake(
                self.text_doc, normalize="lemma", topn=self.top_n_values
            )
        )

    def get_keyterms_based_on_yake(self):
        """
        Extract keyterms using the YAKE algorithm.

        Returns:
            list: (keyterm, score) tuples for the top keyterms by YAKE.
        """
        return list(
            extract.keyterms.yake(
                self.text_doc, normalize="lemma", topn=self.top_n_values
            )
        )

    def bi_gramchunker(self):
        """
        Chunk the text into bigrams.

        Returns:
            list: Bigrams as spaCy Span objects, with stopwords, numbers,
            and punctuation filtered out.
        """
        return list(
            textacy.extract.basics.ngrams(
                self.text_doc,
                n=2,
                filter_stops=True,
                filter_nums=True,
                filter_punct=True,
            )
        )

    def tri_gramchunker(self):
        """
        Chunk the text into trigrams.

        Returns:
            list: Trigrams as spaCy Span objects, with stopwords, numbers,
            and punctuation filtered out.
        """
        return list(
            textacy.extract.basics.ngrams(
                self.text_doc,
                n=3,
                filter_stops=True,
                filter_nums=True,
                filter_punct=True,
            )
        )
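
A minimal usage sketch for the keyterm extractor, assuming textacy and the en_core_web_md spaCy model are installed; the input string is hypothetical.

# Hypothetical input text for illustration.
text = (
    "Experienced full stack engineer skilled in Python, React, and "
    "PostgreSQL, with a focus on scalable web applications."
)
extractor = KeytermExtractor(text, top_n_values=5)
print(extractor.get_keyterms_based_on_textrank())  # [(keyterm, score), ...]
print(extractor.bi_gramchunker())                  # bigram Spans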