-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1012 from MC-shark/feature/improved_pdf
[feature] Commit Message: Optimized PyMuPDFScraper to handle invalid o…
- Loading branch information
Showing
1 changed file
with
52 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,65 @@ | ||
import os | ||
import requests | ||
import tempfile | ||
from urllib.parse import urlparse | ||
from langchain_community.document_loaders import PyMuPDFLoader | ||
|
||
|
||
class PyMuPDFScraper:
    """Scrape the text content of a PDF document given a URL or a local file path."""

    def __init__(self, link, session=None):
        """
        Initialize the scraper with a link and an optional session.

        Args:
            link (str): The URL or local file path of the PDF document.
            session (requests.Session, optional): An optional session for making HTTP requests.
        """
        self.link = link
        self.session = session

    def is_url(self) -> bool:
        """
        Check if the provided `link` is a valid URL.

        Returns:
            bool: True if the link is a valid URL, False otherwise.
        """
        try:
            result = urlparse(self.link)
            # A well-formed absolute URL has both a scheme and a network location.
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def scrape(self) -> str:
        """
        Load the document at `link` (URL or local file) with PyMuPDFLoader
        and return it as a string.

        Remote URLs are first downloaded to a temporary file, which is
        removed after loading (even when loading fails).

        Returns:
            str: A string representation of the loaded document, or an
            empty string if downloading or loading the PDF fails.
        """
        try:
            if self.is_url():
                # Honor a caller-supplied session (connection pooling, auth
                # headers); fall back to the module-level requests API.
                http = self.session if self.session is not None else requests
                response = http.get(self.link, timeout=5, stream=True)
                response.raise_for_status()

                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    temp_filename = temp_file.name  # Remember the path for loading/cleanup
                    for chunk in response.iter_content(chunk_size=8192):
                        temp_file.write(chunk)  # Stream the download to disk

                try:
                    loader = PyMuPDFLoader(temp_filename)
                    doc = loader.load()
                finally:
                    # Always delete the temp file, even if parsing raised,
                    # so failed loads do not leak files on disk.
                    os.remove(temp_filename)
            else:
                loader = PyMuPDFLoader(self.link)
                doc = loader.load()

            return str(doc)

        except requests.exceptions.Timeout:
            print(f"Download timed out. Please check the link : {self.link}")
        except Exception as e:
            print(f"Error loading PDF : {self.link} {e}")
        # Keep the declared str return type on failure instead of an
        # implicit None (still falsy for truthiness-based callers).
        return ""