diff --git a/lib/ai_web_researcher/firecrawl_web_crawler.py b/lib/ai_web_researcher/firecrawl_web_crawler.py index 2068f636..1cb5cce6 100644 --- a/lib/ai_web_researcher/firecrawl_web_crawler.py +++ b/lib/ai_web_researcher/firecrawl_web_crawler.py @@ -1,34 +1,29 @@ import os from pathlib import Path - from firecrawl import FirecrawlApp import logging from dotenv import load_dotenv + # Load environment variables from .env file load_dotenv(Path('../../.env')) +# Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - -def initialize_client(): +def initialize_client() -> FirecrawlApp: """ Initialize and return a Firecrawl client. - Args: - api_key (str): Your Firecrawl API key. - Returns: - firecrawl.Client: An instance of the Firecrawl client. + FirecrawlApp: An instance of the Firecrawl client. """ return FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) - -def scrape_website(website_url, depth=1, max_pages=10): +def scrape_website(website_url: str, depth: int = 1, max_pages: int = 10) -> dict: """ Scrape a website starting from the given URL. Args: - api_key (str): Your Firecrawl API key. website_url (str): The URL of the website to scrape. depth (int, optional): The depth of crawling. Default is 1. max_pages (int, optional): The maximum number of pages to scrape. Default is 10. @@ -36,7 +31,7 @@ def scrape_website(website_url, depth=1, max_pages=10): Returns: dict: The result of the website scraping, or None if an error occurred. """ - client = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) + client = initialize_client() try: result = client.crawl_url({ 'url': website_url, @@ -44,43 +39,41 @@ def scrape_website(website_url, depth=1, max_pages=10): 'max_pages': max_pages }) return result + except KeyError as e: + logging.error(f"Missing key in data: {e}") + except ValueError as e: + logging.error(f"Value error: {e}") except Exception as e: logging.error(f"Error scraping website: {e}") - return None + return None - -def scrape_url(url): +def scrape_url(url: str) -> dict: """ Scrape a specific URL. Args: - api_key (str): Your Firecrawl API key. url (str): The URL to scrape. Returns: dict: The result of the URL scraping, or None if an error occurred. """ client = initialize_client() - #params = { - #'pageOptions': { - # 'onlyMainContent': True - # } - #} try: - #result = client.scrape_url(url, params=params) result = client.scrape_url(url) return result + except KeyError as e: + logging.error(f"Missing key in data: {e}") + except ValueError as e: + logging.error(f"Value error: {e}") except Exception as e: logging.error(f"Error scraping URL: {e}") - return None + return None - -def extract_data(url, schema): +def extract_data(url: str, schema: dict) -> dict: """ Extract structured data from a URL using the provided schema. Args: - api_key (str): Your Firecrawl API key. url (str): The URL to extract data from. schema (dict): The schema to use for data extraction. @@ -94,6 +87,10 @@ def extract_data(url, schema): 'schema': schema }) return result + except KeyError as e: + logging.error(f"Missing key in data: {e}") + except ValueError as e: + logging.error(f"Value error: {e}") except Exception as e: logging.error(f"Error extracting data: {e}") - return None + return None