Files
ALwrity/ToBeMigrated/ai_web_researcher/firecrawl_web_crawler.py
2025-08-06 16:29:49 +05:30

97 lines
2.7 KiB
Python

import os
from pathlib import Path
from firecrawl import FirecrawlApp
import logging
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv(Path('../../.env'))
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def initialize_client() -> FirecrawlApp:
"""
Initialize and return a Firecrawl client.
Returns:
FirecrawlApp: An instance of the Firecrawl client.
"""
return FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
def scrape_website(website_url: str, depth: int = 1, max_pages: int = 10) -> dict:
"""
Scrape a website starting from the given URL.
Args:
website_url (str): The URL of the website to scrape.
depth (int, optional): The depth of crawling. Default is 1.
max_pages (int, optional): The maximum number of pages to scrape. Default is 10.
Returns:
dict: The result of the website scraping, or None if an error occurred.
"""
client = initialize_client()
try:
result = client.crawl_url({
'url': website_url,
'depth': depth,
'max_pages': max_pages
})
return result
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error scraping website: {e}")
return None
def scrape_url(url: str) -> dict:
"""
Scrape a specific URL.
Args:
url (str): The URL to scrape.
Returns:
dict: The result of the URL scraping, or None if an error occurred.
"""
client = initialize_client()
try:
result = client.scrape_url(url)
return result
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error scraping URL: {e}")
return None
def extract_data(url: str, schema: dict) -> dict:
"""
Extract structured data from a URL using the provided schema.
Args:
url (str): The URL to extract data from.
schema (dict): The schema to use for data extraction.
Returns:
dict: The extracted data, or None if an error occurred.
"""
client = initialize_client()
try:
result = client.extract({
'url': url,
'schema': schema
})
return result
except KeyError as e:
logging.error(f"Missing key in data: {e}")
except ValueError as e:
logging.error(f"Value error: {e}")
except Exception as e:
logging.error(f"Error extracting data: {e}")
return None