Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【feature】Commit Message: Optimized PyMuPDFScraper to handle invalid o… #1012

Merged
merged 1 commit into from
Dec 14, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 52 additions & 8 deletions gpt_researcher/scraper/pymupdf/pymupdf.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,65 @@
import os
import requests
import tempfile
from urllib.parse import urlparse
from langchain_community.document_loaders import PyMuPDFLoader


class PyMuPDFScraper:
    """Load a PDF from a URL or a local path and return its content as text.

    Remote documents are streamed into a temporary ``.pdf`` file, parsed with
    ``PyMuPDFLoader`` and the temporary file is always removed afterwards.
    Local paths are handed to ``PyMuPDFLoader`` directly.
    """

    # Seconds to wait for the remote server before giving up on the download.
    DOWNLOAD_TIMEOUT = 5
    # Number of bytes read from the HTTP response per iteration.
    CHUNK_SIZE = 8192

    def __init__(self, link, session=None):
        """
        Initialize the scraper with a link and an optional session.

        Args:
            link (str): The URL or local file path of the PDF document.
            session (requests.Session, optional): Session used for the HTTP
                download. When omitted, the module-level ``requests`` API is
                used instead.
        """
        self.link = link
        self.session = session

    def is_url(self) -> bool:
        """
        Check whether ``link`` looks like a URL.

        A link counts as a URL when it has both a scheme and a network
        location (e.g. ``https://host/file.pdf``); plain file paths have
        neither or only one of them.

        Returns:
            bool: True if the link is a URL, False otherwise.
        """
        try:
            parsed = urlparse(self.link)
        except (ValueError, TypeError, AttributeError):
            # urlparse rejects malformed hosts (e.g. an unclosed IPv6
            # bracket) and non-string input; neither is a usable URL.
            return False
        return bool(parsed.scheme and parsed.netloc)

    def _load_remote(self):
        """Download ``link`` to a temporary file and load it with PyMuPDFLoader.

        Returns:
            The object returned by ``PyMuPDFLoader.load()``.

        Raises:
            requests.exceptions.RequestException: If the download fails or the
                server answers with an error status.
        """
        # Honour the caller-supplied session (headers, cookies, proxies).
        http = self.session if self.session is not None else requests
        temp_path = None
        try:
            with http.get(
                self.link, timeout=self.DOWNLOAD_TIMEOUT, stream=True
            ) as response:
                response.raise_for_status()
                # delete=False: the loader reopens the file by name, so it
                # must outlive this ``with`` block; it is removed in
                # ``finally`` below.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".pdf"
                ) as temp_file:
                    temp_path = temp_file.name
                    for chunk in response.iter_content(
                        chunk_size=self.CHUNK_SIZE
                    ):
                        temp_file.write(chunk)
            # The file is closed (and therefore flushed) before parsing.
            return PyMuPDFLoader(temp_path).load()
        finally:
            # Clean up on failure as well, so aborted downloads and parse
            # errors do not leave stray files behind.
            if temp_path is not None:
                os.remove(temp_path)

    def scrape(self) -> str:
        """
        Load the document behind ``link`` (URL or local file) with
        PyMuPDFLoader and return it as a string.

        Errors are reported on stdout instead of being raised, so one broken
        link does not abort the caller.

        Returns:
            str: A string representation of the loaded document, or an empty
            string if the document could not be downloaded or parsed.
        """
        try:
            if self.is_url():
                doc = self._load_remote()
            else:
                doc = PyMuPDFLoader(self.link).load()
            return str(doc)
        except requests.exceptions.Timeout:
            print(f"Download timed out. Please check the link : {self.link}")
        except Exception as e:
            print(f"Error loading PDF : {self.link} {e}")
        # Keep the declared ``str`` return type on the failure paths.
        return ""