diff --git a/lib/ai_writers/ai_blog_faqs_writer/README.md b/lib/ai_writers/ai_blog_faqs_writer/README.md
new file mode 100644
index 00000000..6023e0c7
--- /dev/null
+++ b/lib/ai_writers/ai_blog_faqs_writer/README.md
@@ -0,0 +1,192 @@
+# AI-Powered FAQ Generator
+
+A sophisticated FAQ generation system that creates comprehensive, well-researched FAQs from various content sources. This tool leverages AI to analyze content, conduct web research, and generate detailed FAQs with customizable options.
+
+## Features
+
+### Content Processing
+- **Multiple Input Sources**
+ - Direct text input
+ - File uploads (DOCX, TXT)
+ - URL content extraction
+ - Support for any content type (general, technical, educational, etc.)
+
+### Research Capabilities
+- **Multi-level Search Depth**
+ - **Basic**: Google Search for quick, general information
+ - **Comprehensive**: Tavily AI for detailed, in-depth research
+ - **Expert**: Metaphor AI for specialized, expert-level content
+
+### Customization Options
+- **Target Audience**
+ - Beginner
+ - Intermediate
+ - Expert
+
+- **FAQ Style**
+ - Technical
+ - Conversational
+ - Professional
+
+- **Advanced Features**
+ - Emoji inclusion
+ - Code example generation
+ - Reference integration
+ - Customizable time range for research
+ - Multi-language support
+
+### Output Formats
+- Interactive preview
+- Markdown
+- HTML
+- JSON
+
+## Installation
+
+1. Clone the repository
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+### Basic Usage
+```python
+from lib.ai_writers.ai_blog_faqs_writer.faqs_generator_blog import FAQGenerator, FAQConfig
+
+# Initialize with default configuration
+generator = FAQGenerator()
+
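+# Generate FAQs from content. generate_faqs() is a coroutine: use `await` inside
+# an async function (or a notebook), or call asyncio.run(...) from a plain script.
+faqs = await generator.generate_faqs("Your content here")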
+```
+
+### Advanced Configuration
+```python
+from lib.ai_writers.ai_blog_faqs_writer.faqs_generator_blog import (
+ FAQGenerator, FAQConfig, TargetAudience, FAQStyle, SearchDepth
+)
+
+# Custom configuration
+config = FAQConfig(
+ num_faqs=10,
+ target_audience=TargetAudience.INTERMEDIATE,
+ faq_style=FAQStyle.TECHNICAL,
+ include_emojis=True,
+ include_code_examples=True,
+ include_references=True,
+ search_depth=SearchDepth.COMPREHENSIVE,
+ time_range="last_6_months",
+ language="English"
+)
+
+generator = FAQGenerator(config)
+```
+
+### Web Interface
+Run the Streamlit interface:
+```bash
+streamlit run lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py
+```
+
+## Research Process
+
+1. **Content Analysis**
+ - Identifies key topics and concepts
+ - Extracts potential questions
+ - Determines research requirements
+
+2. **Web Research**
+ - Selects appropriate search function based on depth
+ - Gathers relevant information
+ - Validates and cross-references data
+
+3. **FAQ Generation**
+ - Creates comprehensive questions
+ - Provides detailed answers
+ - Includes code examples (if applicable)
+ - Adds references and citations
+
+## Output Structure
+
+Each FAQ item includes:
+- Question
+- Detailed answer
+- Category
+- Code example (if applicable)
+- References
+- Confidence score
+- Last updated timestamp
+
+## Configuration Options
+
+### FAQConfig Parameters
+- `num_faqs`: Number of FAQs to generate (default: 5)
+- `target_audience`: Target audience level (default: INTERMEDIATE)
+- `faq_style`: Writing style (default: PROFESSIONAL)
+- `include_emojis`: Whether to include emojis (default: True)
+- `include_code_examples`: Whether to include code examples (default: True)
+- `include_references`: Whether to include references (default: True)
+- `search_depth`: Research depth level (default: COMPREHENSIVE)
+- `time_range`: Time range for research (default: "last_6_months")
+- `language`: Output language (default: "English")
+
+## Research Depth Options
+
+### Basic (Google Search)
+- Quick, general information
+- Broad coverage
+- Suitable for basic topics
+
+### Comprehensive (Tavily AI)
+- Detailed, in-depth research
+- Multiple source integration
+- Best for most use cases
+
+### Expert (Metaphor AI)
+- Specialized, expert-level content
+- Advanced topic coverage
+- Technical and academic focus
+
+## Best Practices
+
+1. **Content Preparation**
+ - Provide clear, well-structured content
+ - Include key terms and concepts
+ - Specify target audience and style
+
+2. **Research Selection**
+ - Use Basic for general topics
+ - Choose Comprehensive for detailed analysis
+ - Select Expert for technical subjects
+
+3. **Output Review**
+ - Verify accuracy of information
+ - Check code examples
+ - Validate references
+
+## Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Commit your changes
+4. Push to the branch
+5. Create a Pull Request
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+## Support
+
+For support, please open an issue in the repository or contact the maintainers.
+
+## Acknowledgments
+
+- OpenAI for GPT integration
+- Google Search API
+- Tavily AI
+- Metaphor AI
+- BeautifulSoup for web scraping
+- Streamlit for UI
\ No newline at end of file
diff --git a/lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py b/lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py
new file mode 100644
index 00000000..8a8f8ea6
--- /dev/null
+++ b/lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py
@@ -0,0 +1,386 @@
+"""
+Enhanced FAQ Generator
+
+This module provides a comprehensive FAQ generation system that can create detailed,
+well-researched FAQs from various content sources with customizable options.
+"""
+
+import sys
+import json
+from typing import Dict, List, Optional, Union
+from pathlib import Path
+from enum import Enum
+from dataclasses import dataclass
+from loguru import logger
+
+from lib.gpt_providers.text_generation.main_text_generation import llm_text_gen
+from lib.ai_web_researcher.google_serp_search import google_search
+from lib.ai_web_researcher.tavily_ai_search import tavily_search
+from lib.ai_web_researcher.metaphor_basic_neural_web_search import metaphor_search_articles
+
+# Replace loguru's existing handlers with a single colorized stdout sink.
+# NOTE(review): this runs at import time on the shared loguru `logger`, so it
+# presumably changes logging for every module in the process - confirm intended.
+logger.remove()
+logger.add(sys.stdout,
+ colorize=True,
+ format="{level}|{file}:{line}:{function}| {message}")
+
+class TargetAudience(Enum):
+ """Reader expertise level; its value is interpolated into the FAQ system prompt."""
+ BEGINNER = "beginner"
+ INTERMEDIATE = "intermediate"
+ EXPERT = "expert"
+
+class FAQStyle(Enum):
+ """Writing style requested for the FAQs; its value is interpolated into the system prompt."""
+ TECHNICAL = "technical"
+ CONVERSATIONAL = "conversational"
+ PROFESSIONAL = "professional"
+
+class SearchDepth(Enum):
+ """Research depth; selects which search function FAQGenerator._conduct_research calls."""
+ # BASIC -> google_search
+ BASIC = "basic"
+ # COMPREHENSIVE -> tavily_search
+ COMPREHENSIVE = "comprehensive"
+ # EXPERT -> metaphor_search_articles
+ EXPERT = "expert"
+
+@dataclass
+class FAQConfig:
+    """Configuration for FAQ generation.
+
+    Attributes:
+        num_faqs: Number of FAQs requested from the LLM.
+        target_audience: Audience level named in the system prompt.
+        faq_style: Writing style named in the system prompt.
+        include_emojis: Passed to the system prompt as the emoji guideline.
+        include_code_examples: When true, the code-example step runs.
+        include_references: When true, the reference step runs.
+        search_depth: Chooses the web-search function used for research.
+        time_range: Research time window label.
+        exclude_domains: Domains to leave out of research, or ``None`` for no
+            exclusions.
+        language: Output language named in the system prompt.
+    """
+    num_faqs: int = 5
+    target_audience: TargetAudience = TargetAudience.INTERMEDIATE
+    faq_style: FAQStyle = FAQStyle.PROFESSIONAL
+    include_emojis: bool = True
+    include_code_examples: bool = True
+    include_references: bool = True
+    search_depth: SearchDepth = SearchDepth.COMPREHENSIVE
+    # NOTE(review): time_range and exclude_domains are not read by the generator
+    # code in this module - confirm whether the search helpers should receive them.
+    time_range: str = "last_6_months"
+    # Optional because the default is None (a shared mutable list default is avoided).
+    exclude_domains: Optional[List[str]] = None
+    language: str = "English"
+
+@dataclass
+class FAQItem:
+    """Individual FAQ item with metadata.
+
+    Attributes:
+        question: The FAQ question text.
+        answer: The answer text.
+        category: Category label parsed from the LLM response.
+        code_example: Generated code sample, or ``None`` when there is none.
+        references: List of ``{"title", "url", "source", "date"}`` dicts, or
+            ``None`` when no references were attached.
+        confidence_score: Confidence parsed from the LLM response (0.0 if absent).
+        last_updated: Timestamp string, or ``None``.
+    """
+    question: str
+    answer: str
+    category: str
+    code_example: Optional[str] = None
+    # Optional because both fields default to None rather than to an empty value.
+    references: Optional[List[Dict[str, str]]] = None
+    confidence_score: float = 0.0
+    # NOTE(review): nothing in this module assigns last_updated - confirm who sets it.
+    last_updated: Optional[str] = None
+
+class FAQGenerator:
+ """Enhanced FAQ Generator with research capabilities."""
+
+    def __init__(self, config: Optional[FAQConfig] = None):
+        """Set up the generator, using a default ``FAQConfig`` when none is given."""
+        if not config:
+            config = FAQConfig()
+        self.config = config
+        # Filled by generate_faqs(); the to_* exporters read from it.
+        self.faqs: List[FAQItem] = []
+        self.research_results = {}
+
+    async def generate_faqs(self, content: str, content_type: str = "general") -> List[FAQItem]:
+        """Run the full pipeline: research, draft, enhance, then optional extras.
+
+        Args:
+            content: Source text to build the FAQs from.
+            content_type: Accepted for callers; not used by this method.
+
+        Returns:
+            The finished FAQ items, which are also stored on ``self.faqs``.
+
+        Raises:
+            Exception: Any failure is logged and then re-raised unchanged.
+        """
+        try:
+            # Research first, so both drafting and enhancement can draw on it.
+            findings = await self._conduct_research(content)
+            drafts = await self._generate_initial_faqs(content, findings)
+            finished = await self._enhance_faqs_with_research(drafts, findings)
+
+            # Optional steps are switched on and off by the configuration.
+            if self.config.include_code_examples:
+                finished = await self._add_code_examples(finished)
+            if self.config.include_references:
+                finished = await self._add_references(finished, findings)
+
+            self.faqs = finished
+            return finished
+        except Exception as err:
+            logger.error(f"Failed to generate FAQs: {err}")
+            raise
+
+    async def _conduct_research(self, content: str) -> Dict:
+        """Derive research topics from the content and run one web search per topic.
+
+        The LLM is asked for topics and questions; every non-empty line of its
+        reply becomes one search query. The search function is picked from
+        ``self.config.search_depth`` (BASIC -> ``google_search``, COMPREHENSIVE ->
+        ``tavily_search``, EXPERT -> ``metaphor_search_articles``); any other
+        value falls back to ``google_search`` with a warning.
+
+        Args:
+            content: Source text the FAQs will be generated from.
+
+        Returns:
+            Dict mapping each topic line to whatever the search function returned.
+            A topic whose search raises is logged and skipped, so one failing
+            query no longer discards results already gathered. An empty dict is
+            returned if topic extraction itself fails.
+        """
+        # NOTE(review): llm_text_gen and the search helpers are awaited, which
+        # assumes they are coroutine functions - confirm against their definitions.
+        try:
+            research_prompt = f"""Based on the following content, identify key topics and questions for research:
+ {content}
+
+ Please provide a list of research topics and questions that would help create comprehensive FAQs.
+ Focus on:
+ 1. Key concepts and terms
+ 2. Common questions users might have
+ 3. Technical aspects that need clarification
+ 4. Best practices and recommendations
+ """
+
+            research_topics = await llm_text_gen(research_prompt)
+
+            # Resolve the search function once instead of once per topic.
+            search_by_depth = {
+                SearchDepth.BASIC: google_search,
+                SearchDepth.COMPREHENSIVE: tavily_search,
+                SearchDepth.EXPERT: metaphor_search_articles,
+            }
+            search = search_by_depth.get(self.config.search_depth)
+            if search is None:
+                logger.warning(f"Unknown search depth: {self.config.search_depth}, defaulting to Google search")
+                search = google_search
+
+            research_results = {}
+            for line in research_topics.split('\n'):
+                topic = line.strip()
+                # Skip blank lines and topics that already have results.
+                if not topic or topic in research_results:
+                    continue
+                try:
+                    research_results[topic] = await search(topic)
+                except Exception as err:
+                    # Best effort: keep the other topics' results.
+                    logger.warning(f"Research failed for topic '{topic}': {err}")
+
+            return research_results
+
+        except Exception as err:
+            logger.error(f"Failed to conduct research: {err}")
+            return {}
+
+    async def _generate_initial_faqs(self, content: str, research_results: Dict) -> List[FAQItem]:
+        """Ask the LLM for draft FAQs and parse its reply into FAQItem objects.
+
+        The prompt spells out the ``Q:`` / ``A:`` / ``Category:`` / ``Confidence:``
+        line labels, because those are the labels the parser recognises.
+
+        Args:
+            content: Source text to generate FAQs from.
+            research_results: Topic -> search results mapping, embedded in the prompt.
+
+        Returns:
+            The parsed FAQ items (possibly empty if the reply has no ``Q:`` lines).
+
+        Raises:
+            Exception: Any failure is logged and re-raised.
+        """
+        try:
+            system_prompt = f"""You are an expert FAQ generator with deep knowledge in content creation and technical writing.
+ Your task is to create comprehensive FAQs based on the given content and research.
+
+ Guidelines:
+ 1. Target Audience: {self.config.target_audience.value}
+ 2. Style: {self.config.faq_style.value}
+ 3. Include emojis: {self.config.include_emojis}
+ 4. Language: {self.config.language}
+ 5. Number of FAQs: {self.config.num_faqs}
+
+ Create FAQs that are:
+ - Clear and concise
+ - Well-structured
+ - Technically accurate
+ - Engaging and informative
+ - Based on the provided research
+ - Relevant to the target audience
+ - Written in the specified style
+ """
+
+            # default=str keeps prompt building alive when a search helper returns
+            # objects json cannot serialize; the text is only context for the LLM.
+            prompt = f"""Content to generate FAQs from:
+ {content}
+
+ Research Results:
+ {json.dumps(research_results, indent=2, default=str)}
+
+ Please generate {self.config.num_faqs} FAQs following the guidelines above.
+ Format each FAQ exactly like this, one field per line, with no extra markup:
+ Q: <question>
+ A: <detailed answer>
+ Category: <category>
+ Confidence: <number between 0 and 1>
+ """
+
+            response = await llm_text_gen(prompt, system_prompt=system_prompt)
+            return self._parse_faq_response(response)
+
+        except Exception as err:
+            logger.error(f"Failed to generate initial FAQs: {err}")
+            raise
+
+    def _parse_faq_response(self, response: str) -> List[FAQItem]:
+        """Turn labelled ``Q:``/``A:``/``Category:``/``Confidence:`` lines into FAQItems."""
+        faqs = []
+        current_faq = None
+        in_answer = False
+
+        for raw_line in (response or "").split('\n'):
+            stripped = raw_line.strip()
+            # Tolerate markdown bold and bullets wrapped around the field labels.
+            line = stripped.replace("**", "").lstrip("-*# ").strip()
+
+            if line.startswith('Q:'):
+                if current_faq:
+                    faqs.append(current_faq)
+                current_faq = FAQItem(question=line[2:].strip(), answer="", category="")
+                in_answer = False
+            elif current_faq is None:
+                # Preamble before the first question carries no FAQ data.
+                continue
+            elif line.startswith('A:'):
+                current_faq.answer = line[2:].strip()
+                in_answer = True
+            elif line.startswith('Category:'):
+                current_faq.category = line[len('Category:'):].strip()
+                in_answer = False
+            elif line.startswith('Confidence:'):
+                in_answer = False
+                raw_score = line[len('Confidence:'):].strip()
+                try:
+                    current_faq.confidence_score = float(raw_score)
+                except ValueError:
+                    # One malformed score should not abort the whole generation.
+                    logger.warning(f"Ignoring unparseable confidence value: {raw_score!r}")
+            elif in_answer and stripped:
+                # Continuation line of a multi-line answer.
+                current_faq.answer = f"{current_faq.answer}\n{stripped}" if current_faq.answer else stripped
+
+        if current_faq:
+            faqs.append(current_faq)
+
+        return faqs
+
+    async def _enhance_faqs_with_research(self, faqs: List[FAQItem], research_results: Dict) -> List[FAQItem]:
+        """Rewrite each FAQ answer using the research that matches its question.
+
+        FAQs with no matching research pass through untouched. Items are updated
+        in place and collected into a new list; if anything raises, the error is
+        logged and the input list is returned as it stands at that point.
+        """
+        try:
+            processed = []
+
+            for item in faqs:
+                matches = self._find_relevant_research(item, research_results)
+
+                # Nothing relevant was found: keep the draft answer.
+                if not matches:
+                    processed.append(item)
+                    continue
+
+                enhancement_prompt = f"""Enhance the following FAQ answer with the provided research:
+
+ Question: {item.question}
+ Current Answer: {item.answer}
+
+ Research:
+ {json.dumps(matches, indent=2)}
+
+ Please enhance the answer while:
+ 1. Maintaining the original style and tone
+ 2. Adding relevant information from the research
+ 3. Ensuring technical accuracy
+ 4. Keeping the answer concise and clear
+ """
+
+                item.answer = await llm_text_gen(enhancement_prompt)
+                processed.append(item)
+
+            return processed
+
+        except Exception as err:
+            logger.error(f"Failed to enhance FAQs with research: {err}")
+            return faqs
+
+    async def _add_code_examples(self, faqs: List[FAQItem]) -> List[FAQItem]:
+        """Generate a code example for every FAQ whose question looks technical.
+
+        The same list is returned, with ``code_example`` set on the matching
+        items. On any error the failure is logged and the list is returned as is.
+        """
+        try:
+            # Lazily pick the technical questions, in their original order.
+            technical_items = (item for item in faqs if self._is_technical_question(item.question))
+            for item in technical_items:
+                code_prompt = f"""Generate a code example for the following FAQ:
+
+ Question: {item.question}
+ Answer: {item.answer}
+
+ Please provide a relevant code example that:
+ 1. Illustrates the answer clearly
+ 2. Includes comments and explanations
+ 3. Follows best practices
+ 4. Is easy to understand
+ """
+
+                item.code_example = await llm_text_gen(code_prompt)
+
+            return faqs
+
+        except Exception as err:
+            logger.error(f"Failed to add code examples: {err}")
+            return faqs
+
+    async def _add_references(self, faqs: List[FAQItem], research_results: Dict) -> List[FAQItem]:
+        """Attach references taken from the research matching each FAQ.
+
+        ``_find_relevant_research`` returns a dict keyed by topic, so references
+        are collected from every matching topic's results. Looking up a literal
+        ``"references"`` key on that dict, as before, found nothing.
+
+        Args:
+            faqs: FAQ items to annotate in place.
+            research_results: Topic -> search results mapping.
+
+        Returns:
+            The same list. FAQs with matching research get ``references`` set to
+            a (possibly empty) list of ``{"title", "url", "source", "date"}``
+            dicts. On error the failure is logged and the list is returned as is.
+        """
+        try:
+            for faq in faqs:
+                relevant_research = self._find_relevant_research(faq, research_results)
+                if relevant_research:
+                    faq.references = self._collect_references(relevant_research)
+
+            return faqs
+
+        except Exception as err:
+            logger.error(f"Failed to add references: {err}")
+            return faqs
+
+    @staticmethod
+    def _collect_references(relevant_research: Dict) -> List[Dict[str, str]]:
+        """Flatten per-topic search results into de-duplicated reference dicts."""
+        # NOTE(review): the search helpers' return shapes are not visible here.
+        # This accepts dicts holding a "references"/"results"/"organic" list, and
+        # plain lists of result dicts - confirm against the helpers' real output.
+        def iter_entries(payload):
+            if isinstance(payload, dict):
+                nested = [payload[key] for key in ("references", "results", "organic")
+                          if isinstance(payload.get(key), (list, tuple))]
+                if nested:
+                    for group in nested:
+                        yield from iter_entries(group)
+                elif any(key in payload for key in ("url", "link", "title")):
+                    yield payload
+            elif isinstance(payload, (list, tuple)):
+                for element in payload:
+                    yield from iter_entries(element)
+
+        references = []
+        seen_urls = set()
+        for results in relevant_research.values():
+            for entry in iter_entries(results):
+                url = entry.get("url") or entry.get("link") or ""
+                if url:
+                    # The same page often appears under several topics.
+                    if url in seen_urls:
+                        continue
+                    seen_urls.add(url)
+                references.append({
+                    "title": entry.get("title") or "",
+                    "url": url,
+                    "source": entry.get("source") or "",
+                    "date": entry.get("date") or "",
+                })
+        return references
+
+    def _find_relevant_research(self, faq: FAQItem, research_results: Dict) -> Dict:
+        """Return the research entries whose topic shares a keyword with the FAQ question.
+
+        Matching is a case-insensitive substring test of each topic word against
+        the question. Very short words and common function words are ignored:
+        words such as "a" or "is" occur inside almost every question and would
+        otherwise mark every topic as relevant.
+
+        Args:
+            faq: Item whose ``question`` is matched.
+            research_results: Topic -> search results mapping.
+
+        Returns:
+            A new dict holding only the matching topic -> results pairs.
+        """
+        # Simple keyword matching for now - can be enhanced with semantic search
+        stop_words = {
+            "the", "and", "for", "are", "with", "how", "what", "why", "when",
+            "where", "which", "who", "can", "does", "from", "that", "this",
+            "you", "your", "not", "was", "will", "about", "into", "have", "has",
+        }
+        question = (faq.question or "").lower()
+
+        relevant_research = {}
+        for topic, results in research_results.items():
+            # Strip list numbering and punctuation the LLM tends to add to topic lines.
+            keywords = (word.strip(".,:;!?()[]{}'`*#-\"") for word in str(topic).lower().split())
+            if any(len(keyword) >= 3 and keyword not in stop_words and keyword in question
+                   for keyword in keywords):
+                relevant_research[topic] = results
+        return relevant_research
+
+    def _is_technical_question(self, question: str) -> bool:
+        """Tell whether the question mentions a programming-related keyword.
+
+        The check is a case-insensitive substring test; a match means the FAQ is
+        a candidate for a generated code example.
+        """
+        lowered = question.lower()
+        for marker in ("code", "program", "function", "method", "class", "api", "syntax", "error", "debug"):
+            if marker in lowered:
+                return True
+        return False
+
+    def to_markdown(self) -> str:
+        """Render ``self.faqs`` as a Markdown document.
+
+        Each FAQ becomes a numbered level-2 heading followed by its answer, an
+        optional fenced code example and an optional bulleted reference list.
+        """
+        parts = ["# Frequently Asked Questions\n\n"]
+
+        for number, item in enumerate(self.faqs, 1):
+            parts.append(f"## {number}. {item.question}\n\n")
+            parts.append(f"{item.answer}\n\n")
+
+            if item.code_example:
+                parts.extend(["```\n", f"{item.code_example}\n", "```\n\n"])
+
+            if item.references:
+                parts.append("### References\n")
+                parts.extend(
+                    f"- [{ref['title']}]({ref['url']}) - {ref['source']} ({ref['date']})\n"
+                    for ref in item.references
+                )
+                parts.append("\n")
+
+        return "".join(parts)
+
+ def to_html(self) -> str:
+ """Convert FAQs to HTML format."""
+ # Builds the page by string concatenation: a header, one fragment per FAQ
+ # (question, answer, optional code example, optional references), then a footer.
+ # NOTE(review): the literals below show no HTML tags in this view; the markup
+ # appears to have been stripped in transit - verify against the real file.
+ # NOTE(review): FAQ text is interpolated as-is; if it can contain markup,
+ # escaping (e.g. html.escape) is presumably needed - confirm.
+ html = """
+
+
+
+ Frequently Asked Questions
+
+
+
+
+
Frequently Asked Questions
+ """
+
+ # One fragment per FAQ, numbered from 1.
+ for i, faq in enumerate(self.faqs, 1):
+ html += f"""
+
+
{i}. {faq.question}
+
{faq.answer}
+ """
+
+ # Code example block is emitted only when one was generated.
+ if faq.code_example:
+ html += f"""
+
{faq.code_example}
+ """
+
+ # Reference list is emitted only when references were attached.
+ if faq.references:
+ html += """
+
+
References
+
+ """
+ for ref in faq.references:
+ html += f"""
+ - {ref['title']} - {ref['source']} ({ref['date']})
+ """
+ html += """
+
+
+ """
+
+ # Closes the per-FAQ fragment.
+ html += """
+
+ """
+
+ # Document footer.
+ html += """
+
+
+
+ """
+
+ return html
diff --git a/lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py b/lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py
new file mode 100644
index 00000000..720cb91b
--- /dev/null
+++ b/lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py
@@ -0,0 +1,177 @@
+"""
+Streamlit UI for FAQ Generator
+
+This module provides a user-friendly interface for generating FAQs from various content sources.
+"""
+
+import streamlit as st
+import asyncio
+from pathlib import Path
+from typing import Optional
+import json
+import requests
+from bs4 import BeautifulSoup
+
+from .faqs_generator_blog import FAQGenerator, FAQConfig, TargetAudience, FAQStyle, SearchDepth
+
+
+def fetch_url_content(url, timeout=10):
+    """Fetch a web page and return its visible text, one phrase per line.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Seconds to wait for the server. Without a timeout a stalled
+            server would block the Streamlit script indefinitely.
+
+    Returns:
+        The extracted text, or ``None`` if the request or parsing failed (the
+        error is reported in the UI through ``st.error``).
+    """
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Script and style bodies are not visible text.
+        for element in soup(["script", "style"]):
+            element.decompose()
+
+        # Trim every line of the extracted text.
+        lines = (line.strip() for line in soup.get_text().splitlines())
+        # A run of two spaces separates distinct phrases (e.g. several headlines
+        # on one line). Splitting on a single space would put every word on its
+        # own line, so the separator is spelled out explicitly.
+        phrase_gap = " " * 2
+        chunks = (phrase.strip() for line in lines for phrase in line.split(phrase_gap))
+        # Drop blank entries.
+        return '\n'.join(chunk for chunk in chunks if chunk)
+    except Exception as e:
+        # UI boundary: report the problem to the user instead of crashing the page.
+        st.error(f"Error fetching URL content: {str(e)}")
+        return None
+
+def _read_uploaded_file(uploaded_file):
+    """Return the text of an uploaded ``.txt`` or ``.docx`` file.
+
+    DOCX files are ZIP archives; their text is read from ``word/document.xml``
+    with the standard library, one output line per paragraph.
+    """
+    import io
+    import zipfile
+    import xml.etree.ElementTree as ElementTree
+
+    data = uploaded_file.getvalue()
+    if not uploaded_file.name.lower().endswith(".docx"):
+        return data.decode("utf-8", errors="replace")
+
+    # NOTE(review): ElementTree is not hardened against maliciously crafted XML;
+    # fine for a user's own uploads, reconsider for untrusted sources.
+    word_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+    with zipfile.ZipFile(io.BytesIO(data)) as archive:
+        document = ElementTree.fromstring(archive.read("word/document.xml"))
+    paragraphs = (
+        "".join(run.text or "" for run in paragraph.iter(f"{word_ns}t"))
+        for paragraph in document.iter(f"{word_ns}p")
+    )
+    return "\n".join(text for text in paragraphs if text)
+
+
+def _render_faqs(generator):
+    """Show the generator's FAQs in the output format picked by the user."""
+    faqs = generator.faqs
+    output_format = st.radio(
+        "Output Format",
+        ["Preview", "Markdown", "HTML", "JSON"]
+    )
+
+    if output_format == "Preview":
+        for i, faq in enumerate(faqs, 1):
+            with st.expander(f"{i}. {faq.question}"):
+                st.markdown(faq.answer)
+                if faq.code_example:
+                    st.code(faq.code_example)
+                if faq.references:
+                    st.markdown("**References:**")
+                    for ref in faq.references:
+                        st.markdown(f"- [{ref['title']}]({ref['url']}) - {ref['source']} ({ref['date']})")
+
+    elif output_format == "Markdown":
+        markdown_output = generator.to_markdown()
+        st.code(markdown_output, language="markdown")
+        st.download_button(
+            "Download Markdown",
+            markdown_output,
+            file_name="faqs.md",
+            mime="text/markdown"
+        )
+
+    elif output_format == "HTML":
+        html_output = generator.to_html()
+        st.code(html_output, language="html")
+        st.download_button(
+            "Download HTML",
+            html_output,
+            file_name="faqs.html",
+            mime="text/html"
+        )
+
+    elif output_format == "JSON":
+        json_output = json.dumps([faq.__dict__ for faq in faqs], indent=2)
+        st.code(json_output, language="json")
+        st.download_button(
+            "Download JSON",
+            json_output,
+            file_name="faqs.json",
+            mime="application/json"
+        )
+
+
+def main():
+    """Render the FAQ Generator page: settings, content input, generation, results.
+
+    Generated results are kept in ``st.session_state``. Streamlit reruns the
+    whole script on every widget interaction and ``st.button`` is only true on
+    the run right after the click, so results held in local variables would
+    vanish as soon as the user changed the output format or pressed download.
+    """
+    try:
+        st.set_page_config(
+            page_title="FAQ Generator",
+            page_icon="❓",
+            layout="wide"
+        )
+    except st.errors.StreamlitAPIException:
+        # Deliberately ignored: this page is also mounted inside the AI writers
+        # dashboard, where the host app has already configured the page.
+        pass
+
+    st.title("FAQ Generator")
+    st.markdown("Generate comprehensive FAQs from your content with research integration.")
+
+    # Sidebar for configuration
+    with st.sidebar:
+        st.header("Configuration")
+
+        # Basic settings
+        num_faqs = st.slider("Number of FAQs", 1, 20, 5)
+        target_audience = st.selectbox(
+            "Target Audience",
+            [audience.value for audience in TargetAudience]
+        )
+        faq_style = st.selectbox(
+            "FAQ Style",
+            [style.value for style in FAQStyle]
+        )
+
+        # Advanced settings
+        with st.expander("Advanced Settings"):
+            include_emojis = st.checkbox("Include Emojis", value=True)
+            include_code_examples = st.checkbox("Include Code Examples", value=True)
+            include_references = st.checkbox("Include References", value=True)
+
+            search_depth = st.selectbox(
+                "Search Depth",
+                [depth.value for depth in SearchDepth]
+            )
+            time_range = st.selectbox(
+                "Time Range",
+                ["last_month", "last_6_months", "last_year", "all_time"]
+            )
+            language = st.text_input("Language", value="English")
+
+    # Main content area
+    content_type = st.radio(
+        "Content Source",
+        ["Direct Input", "File Upload", "URL"]
+    )
+
+    content = ""
+    if content_type == "Direct Input":
+        content = st.text_area("Enter your content", height=300)
+
+    elif content_type == "File Upload":
+        # This option was offered before but had no handler.
+        uploaded_file = st.file_uploader("Upload a file", type=["txt", "docx"])
+        if uploaded_file is not None:
+            try:
+                content = _read_uploaded_file(uploaded_file)
+            except Exception as e:
+                st.error(f"Error reading uploaded file: {str(e)}")
+            if content:
+                st.text_area("Extracted Content", content, height=300)
+
+    elif content_type == "URL":
+        url = st.text_input("Enter URL")
+        if url:
+            content = fetch_url_content(url)
+            if content:
+                st.text_area("Extracted Content", content, height=300)
+
+    session_key = "faq_generator_result"
+
+    # Generate button
+    if st.button("Generate FAQs"):
+        if not content:
+            st.warning("Please provide some content before generating FAQs.")
+        else:
+            try:
+                # Create config
+                config = FAQConfig(
+                    num_faqs=num_faqs,
+                    target_audience=TargetAudience(target_audience),
+                    faq_style=FAQStyle(faq_style),
+                    include_emojis=include_emojis,
+                    include_code_examples=include_code_examples,
+                    include_references=include_references,
+                    search_depth=SearchDepth(search_depth),
+                    time_range=time_range,
+                    language=language
+                )
+
+                # Initialize generator
+                generator = FAQGenerator(config)
+
+                # Generate FAQs
+                with st.spinner("Generating FAQs..."):
+                    asyncio.run(generator.generate_faqs(content))
+
+                # Keep the result across reruns.
+                st.session_state[session_key] = generator
+                st.success("FAQs generated successfully!")
+
+            except Exception as e:
+                st.error(f"Error generating FAQs: {str(e)}")
+
+    # Display the most recent result, if any, on every run.
+    generator = st.session_state.get(session_key)
+    if generator is not None:
+        try:
+            _render_faqs(generator)
+        except Exception as e:
+            st.error(f"Error displaying FAQs: {str(e)}")
+
+# Entry point for running this page on its own (e.g. via `streamlit run`).
+# NOTE(review): the relative import of `.faqs_generator_blog` at the top of this
+# file presumably fails when executed as a script with no parent package - confirm.
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/lib/ai_writers/ai_writer_dashboard.py b/lib/ai_writers/ai_writer_dashboard.py
index a4f116fe..ba157a9a 100644
--- a/lib/ai_writers/ai_writer_dashboard.py
+++ b/lib/ai_writers/ai_writer_dashboard.py
@@ -6,7 +6,7 @@ from lib.ai_writers.ai_product_description_writer import write_ai_prod_desc
from lib.ai_writers.ai_copywriter.copywriter_dashboard import copywriter_dashboard
from lib.ai_writers.linkedin_writer import LinkedInAIWriter
from lib.ai_writers.blog_rewriter_updater.ai_blog_rewriter import write_blog_rewriter
-#from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_content_planner
+from lib.ai_writers.ai_blog_faqs_writer.faqs_ui import main as faqs_generator
from lib.ai_writers.ai_blog_writer.ai_blog_generator import ai_blog_writer_page
from loguru import logger
@@ -84,6 +84,14 @@ def list_ai_writers():
"category": "Professional",
"function": lambda: LinkedInAIWriter().run(),
"path": "linkedin_writer"
+ },
+ {
+ "name": "FAQ Generator",
+ "icon": "❓",
+ "description": "Generate comprehensive, well-researched FAQs from any content source with customizable options",
+ "category": "Content Creation",
+ "function": faqs_generator,
+ "path": "faqs_generator"
}
]
diff --git a/lib/ai_writers/github_blogs/README.md b/lib/ai_writers/github_blogs/README.md
new file mode 100644
index 00000000..194490a1
--- /dev/null
+++ b/lib/ai_writers/github_blogs/README.md
@@ -0,0 +1,259 @@
+# GitHub Blog Generator
+
+A powerful AI-powered content generation system that automatically creates comprehensive documentation, tutorials, and guides from GitHub repositories. This module transforms GitHub repository data into various types of high-quality technical content.
+
+## Features
+
+### 1. Content Generation Types
+
+The system can generate the following types of content from GitHub repositories:
+
+- **Getting Started Guides**
+ - Introduction and Overview
+ - Prerequisites and Setup
+ - Installation Instructions
+ - Basic Usage Examples
+ - Common Use Cases
+ - Best Practices
+ - Next Steps and Resources
+
+- **Technical Documentation**
+ - Architecture Overview
+ - Core Components
+ - Technical Specifications
+ - Integration Points
+ - Performance Considerations
+ - Security Features
+ - API Documentation
+ - Configuration Options
+ - Deployment Guidelines
+ - Troubleshooting Guide
+
+- **Tutorial Series**
+ - Beginner Tutorials
+ - Basic concepts
+ - Simple examples
+ - Step-by-step instructions
+ - Intermediate Tutorials
+ - Advanced features
+ - Real-world examples
+ - Best practices
+ - Advanced Tutorials
+ - Complex use cases
+ - Performance optimization
+ - Integration patterns
+
+- **Comparison Analysis**
+ - Feature Comparison
+ - Performance Analysis
+ - Use Case Suitability
+ - Community and Support
+ - Learning Curve
+ - Integration Capabilities
+ - Future Prospects
+
+- **Case Studies**
+ - Problem Statement
+ - Solution Implementation
+ - Technical Challenges
+ - Results and Benefits
+ - Lessons Learned
+ - Future Improvements
+
+- **Contribution Guides**
+ - Development Setup
+ - Code Style Guidelines
+ - Testing Requirements
+ - Documentation Standards
+ - Pull Request Process
+ - Review Guidelines
+ - Community Guidelines
+
+- **Security Guides**
+ - Security Architecture
+ - Authentication & Authorization
+ - Data Protection
+ - Secure Configuration
+ - Vulnerability Management
+ - Incident Response
+ - Compliance Requirements
+
+- **Performance Guides**
+ - Performance Metrics
+ - Optimization Techniques
+ - Benchmarking Guidelines
+ - Resource Management
+ - Scaling Strategies
+ - Monitoring Setup
+ - Troubleshooting
+
+### 2. GitHub Content Scraping
+
+The module includes a sophisticated GitHub content scraper with the following capabilities:
+
+- **Rate Limiting**
+ - Configurable API call limits
+ - Automatic request throttling
+ - Concurrent request management
+
+- **Caching System**
+ - Configurable cache duration (TTL)
+ - Automatic cache invalidation
+ - Efficient storage of scraped content
+
+- **Content Extraction**
+ - Repository metadata
+ - README content
+ - File contents
+ - Repository topics
+ - Contributor information
+ - License information
+
+### 3. Content Enhancement
+
+- **Online Research Integration**
+ - Automatic topic research
+ - Related content discovery
+ - Industry trend analysis
+
+- **FAQ Generation**
+ - Automatic FAQ creation
+ - Common question identification
+ - Comprehensive answers
+
+- **Metadata Generation**
+ - SEO-optimized titles
+ - Meta descriptions
+ - Tags and categories
+ - Content structuring
+
+## Usage Examples
+
+### Basic Usage
+
+```python
+from lib.ai_writers.github_blogs import GitHubBlogGenerator
+
+# Initialize the generator
+generator = GitHubBlogGenerator()
+
+# Generate content for a GitHub repository
+content = await generator.generate_content(
+ github_url="https://github.com/owner/repo",
+ content_types=["getting_started", "technical_docs", "tutorials"]
+)
+
+# Save the generated content
+generator.save_content(content, "my_repository")
+```
+
+### Advanced Usage
+
+```python
+from lib.ai_writers.github_blogs import GitHubBlogGenerator
+
+# Initialize with custom settings
+generator = GitHubBlogGenerator(
+ cache_dir=".custom_cache",
+ ttl_hours=48
+)
+
+# Generate all content types
+content_types = [
+ "getting_started",
+ "technical_docs",
+ "tutorials",
+ "comparison",
+ "case_studies",
+ "contribution",
+ "security",
+ "performance"
+]
+
+# Generate content for multiple repositories
+urls = [
+ "https://github.com/owner/repo1",
+ "https://github.com/owner/repo2"
+]
+
+for url in urls:
+ content = await generator.generate_content(url, content_types)
+ generator.save_content(content, url.split("/")[-1])
+```
+
+## Configuration Options
+
+### GitHubBlogGenerator
+
+- `cache_dir` (str): Directory for caching scraped content (default: ".github_cache")
+- `ttl_hours` (int): Time-to-live for cached content in hours (default: 24)
+
+### Content Generation
+
+- `gpt_provider` (str): Choice of AI provider ("gemini" or "openai")
+- `content_types` (List[str]): Types of content to generate
+- `github_url` (str): URL of the GitHub repository
+
+## Output Format
+
+All generated content is saved in Markdown format with the following structure:
+
+```markdown
+# [Title]
+
+[Generated content based on content type]
+
+## Metadata
+- Title: [SEO-optimized title]
+- Description: [Meta description]
+- Tags: [Generated tags]
+- Categories: [Generated categories]
+```
+
+## Best Practices
+
+1. **Rate Limiting**
+ - Configure appropriate rate limits based on your GitHub API quota
+ - Use caching to minimize API calls
+ - Implement proper error handling for rate limit exceeded scenarios
+
+2. **Content Generation**
+ - Start with basic content types before generating advanced content
+ - Review generated content for accuracy and completeness
+ - Customize prompts for specific repository types
+
+3. **Caching**
+ - Set appropriate TTL based on repository update frequency
+ - Clear cache when repository content changes significantly
+ - Monitor cache size and performance
+
+4. **Error Handling**
+ - Implement proper error handling for API failures
+ - Log errors for debugging
+ - Provide fallback mechanisms for failed content generation
+
+## Dependencies
+
+- Python 3.8+
+- aiohttp
+- beautifulsoup4
+- loguru
+- pydantic
+- requests
+- pandas
+
+## Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Commit your changes
+4. Push to the branch
+5. Create a Pull Request
+
+## License
+
+[Your License Here]
+
+## Support
+
+For support, please [create an issue](https://github.com/your-repo/issues) or contact the maintainers.
\ No newline at end of file
diff --git a/lib/ai_writers/github_blogs/github_getting_started.py b/lib/ai_writers/github_blogs/github_getting_started.py
index 17ecc201..81b247b5 100644
--- a/lib/ai_writers/github_blogs/github_getting_started.py
+++ b/lib/ai_writers/github_blogs/github_getting_started.py
@@ -1,39 +1,254 @@
+"""
+Enhanced GitHub Content Generator
+
+This module provides various content generation capabilities from GitHub repository data,
+including getting started guides, technical documentation, tutorials, and more.
+"""
+
import sys
-
-from .gpt_providers.openai_chat_completion import openai_chatgpt
-from .gpt_providers.gemini_pro_text import gemini_text_response
-
+from typing import Dict, List, Optional
from loguru import logger
+
+from lib.gpt_providers.text_generation.main_text_generation import llm_text_gen
+
logger.remove()
logger.add(sys.stdout,
- colorize=True,
- format="{level}|{file}:{line}:{function}| {message}"
- )
+ colorize=True,
+ format="{level}|{file}:{line}:{function}| {message}")
+def generate_technical_documentation(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate comprehensive technical documentation from repository data.
+
+ Embeds ``repo_data`` in a fixed ten-section documentation prompt and
+ passes that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As an expert technical writer, create detailed technical documentation for the following GitHub repository:
+Repository Data:
+{repo_data}
-def github_readme_blog(readme_content):
- """ """
- prompt = f"""As an expert programmer and teacher, Write an original, detailed and step-by-step guide, from the provided Text below.
- Your guide should be original, engaging and help beginners get started easily.
- Write new example codes and detailed comments on how to run them. Include appropriate emoji where applicable.
- Include a referances section that links to more code examples.
- Your response MUST be a how-to blog in markdown format.
- Respond ONLY with your blog content.
+Please create a comprehensive technical documentation that includes:
+1. Architecture Overview
+2. Core Components
+3. Technical Specifications
+4. Integration Points
+5. Performance Considerations
+6. Security Features
+7. API Documentation (if applicable)
+8. Configuration Options
+9. Deployment Guidelines
+10. Troubleshooting Guide
- Text: '{readme_content}'
- """
- if 'gemini' in gpt_providers:
- try:
- response = gemini_text_response(prompt)
- return response
- except Exception as err:
- logger.error(f"Failed to get response from gemini: {err}")
- sys.exit(1)
- elif 'openai' in gpt_providers:
- try:
- logger.info("Calling OpenAI LLM.")
- response = openai_chatgpt(prompt)
- return response
- except Exception as err:
- SystemError(f"Failed to get response from Openai: {err}")
+Format the documentation in markdown with appropriate headers, code blocks, and diagrams.
+Include real-world examples and best practices.
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_getting_started_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate a beginner-friendly getting started guide.
+
+ Embeds ``repo_data`` in a fixed seven-step guide prompt and passes that
+ prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As an expert programmer and teacher, create a comprehensive getting started guide for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create a step-by-step guide that includes:
+1. Introduction and Overview
+2. Prerequisites and Setup
+3. Installation Instructions
+4. Basic Usage Examples
+5. Common Use Cases
+6. Best Practices
+7. Next Steps and Resources
+
+Make the guide:
+- Beginner-friendly with clear explanations
+- Include practical examples with code snippets
+- Add emojis for better readability
+- Include troubleshooting tips
+- Provide links to additional resources
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_tutorial_series(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate a series of tutorials for different skill levels.
+
+ Embeds ``repo_data`` in a prompt that asks for beginner, intermediate and
+ advanced tutorials, then passes that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As an expert educator, create a series of tutorials for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create a structured tutorial series that includes:
+1. Beginner Tutorial
+ - Basic concepts
+ - Simple examples
+ - Step-by-step instructions
+
+2. Intermediate Tutorial
+ - Advanced features
+ - Real-world examples
+ - Best practices
+
+3. Advanced Tutorial
+ - Complex use cases
+ - Performance optimization
+ - Integration patterns
+
+Each tutorial should:
+- Be self-contained
+- Include practical examples
+- Have clear learning objectives
+- Include exercises and challenges
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_comparison_analysis(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate a comparison analysis with similar tools/frameworks.
+
+ Embeds ``repo_data`` in a fixed seven-point comparison prompt and passes
+ that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As a technical analyst, create a comprehensive comparison analysis for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create a detailed comparison that includes:
+1. Feature Comparison
+2. Performance Analysis
+3. Use Case Suitability
+4. Community and Support
+5. Learning Curve
+6. Integration Capabilities
+7. Future Prospects
+
+Include:
+- Pros and Cons
+- Real-world use cases
+- Industry adoption
+- Community feedback
+- Future roadmap
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_case_studies(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate real-world case studies and success stories.
+
+ Embeds ``repo_data`` in a fixed six-part case-study prompt and passes
+ that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As a technical writer, create compelling case studies for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create detailed case studies that include:
+1. Problem Statement
+2. Solution Implementation
+3. Technical Challenges
+4. Results and Benefits
+5. Lessons Learned
+6. Future Improvements
+
+Make the case studies:
+- Based on real-world scenarios
+- Include technical details
+- Show measurable results
+- Provide actionable insights
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_contribution_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate a comprehensive contribution guide.
+
+ Embeds ``repo_data`` in a fixed seven-section contribution-guide prompt
+ and passes that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As an open-source maintainer, create a detailed contribution guide for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create a contribution guide that includes:
+1. Development Setup
+2. Code Style Guidelines
+3. Testing Requirements
+4. Documentation Standards
+5. Pull Request Process
+6. Review Guidelines
+7. Community Guidelines
+
+Make the guide:
+- Clear and concise
+- Include examples
+- Cover all contribution types
+- Provide templates
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_security_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate a security best practices guide.
+
+ Embeds ``repo_data`` in a fixed seven-section security prompt and passes
+ that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As a security expert, create a comprehensive security guide for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create a security guide that includes:
+1. Security Architecture
+2. Authentication & Authorization
+3. Data Protection
+4. Secure Configuration
+5. Vulnerability Management
+6. Incident Response
+7. Compliance Requirements
+
+Make the guide:
+- Practical and actionable
+- Include security checklists
+- Provide code examples
+- Cover common vulnerabilities
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def generate_performance_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str:
+ """Generate a performance optimization guide.
+
+ Embeds ``repo_data`` in a fixed seven-section performance prompt and
+ passes that prompt to ``_get_llm_response``.
+
+ Args:
+ repo_data: Repository details; interpolated into the prompt as text.
+ gpt_provider: Provider label forwarded to ``_get_llm_response``.
+
+ Returns:
+ The result of ``_get_llm_response`` for the built prompt.
+ """
+ prompt = f"""As a performance optimization expert, create a detailed performance guide for the following GitHub repository:
+
+Repository Data:
+{repo_data}
+
+Create a performance guide that includes:
+1. Performance Metrics
+2. Optimization Techniques
+3. Benchmarking Guidelines
+4. Resource Management
+5. Scaling Strategies
+6. Monitoring Setup
+7. Troubleshooting
+
+Make the guide:
+- Data-driven
+- Include benchmarks
+- Provide optimization tips
+- Cover different scales
+"""
+ return _get_llm_response(prompt, gpt_provider)
+
+def _get_llm_response(prompt: str, gpt_provider: str) -> str:
+    """Send ``prompt`` to the LLM together with the shared system prompt.
+
+    Args:
+        prompt: Task-specific prompt built by one of the ``generate_*`` helpers.
+        gpt_provider: Provider label. It only appears in the error log message;
+            the request itself always goes through ``llm_text_gen``.
+
+    Returns:
+        The value returned by ``llm_text_gen`` for this prompt.
+
+    Raises:
+        Exception: Any error raised by ``llm_text_gen`` is logged and re-raised.
+    """
+    system_prompt = """You are an expert technical writer and GitHub repository analyst with deep expertise in software development, documentation, and technical communication.
+
+ Your role is to create high-quality, accurate, and engaging content based on GitHub repository data. You should:
+
+ 1. **Technical Accuracy**
+ - Ensure all technical information is precise and up-to-date
+ - Verify code examples and configurations
+ - Cross-reference documentation and source code
+ - Maintain consistency with repository standards
+
+ 2. **Content Structure**
+ - Use clear hierarchical organization
+ - Include appropriate code blocks and examples
+ - Add relevant diagrams and visual aids
+ - Break complex topics into digestible sections
+
+ 3. **Writing Style**
+ - Maintain a professional yet approachable tone
+ - Use active voice and clear language
+ - Include practical examples and use cases
+ - Add relevant emojis for better readability
+
+ 4. **Best Practices**
+ - Follow industry-standard documentation practices
+ - Include troubleshooting sections
+ - Add performance considerations
+ - Address security implications
+"""
+    try:
+        # Hand the generated text back: every generate_* helper returns this
+        # function's result directly to its caller.
+        return llm_text_gen(prompt, system_prompt=system_prompt)
+    except Exception as err:
+        logger.error(f"Failed to get response from {gpt_provider}: {err}")
+        raise
diff --git a/lib/ai_writers/github_blogs/main_getting_started_blogs.py b/lib/ai_writers/github_blogs/main_getting_started_blogs.py
index c397fefc..5d84a565 100644
--- a/lib/ai_writers/github_blogs/main_getting_started_blogs.py
+++ b/lib/ai_writers/github_blogs/main_getting_started_blogs.py
@@ -1,140 +1,157 @@
-""" Package for writing getting-started and how to guides. """
+"""
+Enhanced GitHub Blog Generator
+
+This module provides comprehensive content generation from GitHub repositories,
+including technical documentation, tutorials, case studies, and more.
+"""
import os
import sys
import datetime
import json
+from typing import Dict, List, Optional
+from pathlib import Path
from loguru import logger
logger.remove()
logger.add(sys.stdout,
- colorize=True,
- format="{level}|{file}:{line}:{function}| {message}"
- )
+ colorize=True,
+ format="{level}|{file}:{line}:{function}| {message}")
+from .scrape_github_readme import GitHubScraper, GitHubContent
from .scrape_github_readme import get_gh_details_vision, get_readme_content
from .scrape_github_readme import research_github_topics, check_if_already_written
-from .github_getting_started import github_readme_blog
-from .gpt_online_researcher import do_online_research
-from .faqs_generator_blog import generate_blog_faq
-from .get_blog_metadata import blog_metadata
-from .save_blog_to_file import save_blog_to_file
-from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
+from .github_getting_started import (
+ generate_technical_documentation,
+ generate_getting_started_guide,
+ generate_tutorial_series,
+ generate_comparison_analysis,
+ generate_case_studies,
+ generate_contribution_guide,
+ generate_security_guide,
+ generate_performance_guide
+)
-
-def blog_from_github(github_opts, flag):
- """ Module for writing getting started code examples from github. """
- if 'url' in flag:
- try:
- write_from_url(github_opts)
- except Exception as err:
- logger.error(f"Failed to write from github url: {github_opts}")
- sys.exit(1)
- elif 'csv' in flag:
- try:
- gh_urls = []
- with open(github_opts, 'r', encoding="utf-8") as file:
- # Read each line in the file
- for gh_url in file:
- gh_urls.append(gh_url.strip())
- except FileNotFoundError:
- logger.error(f"CSV File not found: {file_path}")
- except Exception as e:
- logger.error(f"CSV: An error occurred: {str(e)}")
-
- for gh_url in gh_urls:
- try:
- write_from_url(gh_url.strip())
- except Exception as err:
- logger.error(f"Failed to write blog from github: {err}")
-
-
-
-def write_from_url(gh_url):
- # String to store the blog content.
- howto_blog = ''
- # The url was not found in already_written data.
- if not check_if_already_written(gh_url):
- logger.info(f"Writing getting started from url: {gh_url}")
- else:
- logger.error(f"Skipping, already written on url: {gh_url}")
- return
-
- # Direct link to the raw content of README file
- # fixme: Remove the hardcoding, need add another option OR in config ?
- image_dir = os.path.join(os.getcwd(), "blog_images")
- generated_image_name = f"screenshot_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
- generated_image_filepath = os.path.join(image_dir, generated_image_name)
- try:
- logger.info(f"Getting github repo details from vision model: {generated_image_filepath}")
- gh_json = get_gh_details_vision(gh_url, generated_image_filepath)
- except Exception as err:
- logger.error(f"Failed to get gemini vision details from GH repo image: {err}")
- sys.exit(1)
- howto_blog = "```" + f"\nGithub URL:{gh_url}\nStars:{gh_json.get('stars')}\n"
- howto_blog += f"Forks:{gh_json.get('forks')}\n"
- howto_blog += f"Description:{gh_json.get('about')}\nBranch:{gh_json.get('branch_name')}\n" + "```\n\n"
-
- raw_readme_url_base = "https://raw.githubusercontent.com/" + "/".join(gh_url.split("/")[-2:])
- if gh_json.get('branch_name'):
- raw_readme_url = raw_readme_url_base + f"/{gh_json.get('branch_name')}/" + "README.md"
- else:
- raw_readme_url = raw_readme_url_base + f"/main/" + "README.md"
- logger.info(f"Using this url to fetch the README file: {raw_readme_url}")
-
- try:
- # Get and print the main content
- readme_content = get_readme_content(raw_readme_url)
- except Exception as err:
- logger.error(f"Failed to get README from URL: {raw_readme_url}: {err}")
- # If the readme is still None, try with master branch.
- if not readme_content:
- raw_readme_url = raw_readme_url_base + f"/master/" + "README.md"
- logger.warning(f"Trying with master branch: {raw_readme_url}")
- readme_content = get_readme_content(raw_readme_url)
- if not readme_content:
- logger.error(f"Still failed to get the README: {readme_content}")
- sys.exit(1)
+class GitHubBlogGenerator:
+ """Generator for various types of GitHub-related content."""
- # Create a getting-started blog, adapted from the GH url README.
- howto_blog += github_readme_blog(readme_content, "gemini")
+ def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24):
+        """Create the scraper and make sure the output directory exists.
+
+        Args:
+            cache_dir: Directory handed to ``GitHubScraper`` for its cache.
+            ttl_hours: Cache lifetime, in hours, handed to ``GitHubScraper``.
+        """
+        self.cache_dir = Path(cache_dir)
+        self.scraper = GitHubScraper(cache_dir, ttl_hours)
+
+        # Generated documents go to a fixed directory relative to the CWD.
+        output_dir = Path("generated_content")
+        output_dir.mkdir(exist_ok=True)
+        self.output_dir = output_dir
+
+ async def generate_content(self, github_url: str, content_types: Optional[List[str]] = None) -> Dict[str, str]:
+        """Generate the requested kinds of content for one GitHub repository.
+
+        Args:
+            github_url: URL of the repository to scrape.
+            content_types: Content kinds to produce. Defaults to
+                ``["getting_started", "technical_docs", "tutorials"]``.
+                Unknown kinds are logged and skipped.
+
+        Returns:
+            Mapping of content type to generated text, plus a ``"faqs"`` entry
+            when the FAQ step succeeds.
+
+        Raises:
+            Exception: Scraping or generation failures are logged and re-raised.
+                A failing FAQ step is only logged.
+        """
+        if content_types is None:
+            content_types = ["getting_started", "technical_docs", "tutorials"]
+
+        # One generator per supported content type.
+        generators = {
+            "getting_started": generate_getting_started_guide,
+            "technical_docs": generate_technical_documentation,
+            "tutorials": generate_tutorial_series,
+            "comparison": generate_comparison_analysis,
+            "case_studies": generate_case_studies,
+            "contribution": generate_contribution_guide,
+            "security": generate_security_guide,
+            "performance": generate_performance_guide,
+        }
+
+        try:
+            # Scrape GitHub content
+            repo_content = await self.scraper.scrape_github_content(github_url)
+            # Every generator receives the same data, so serialise it once.
+            repo_data = repo_content.dict()
+
+            generated_content = {}
+            for content_type in content_types:
+                generator = generators.get(content_type)
+                if generator is None:
+                    logger.warning(f"Unknown content type: {content_type}")
+                    continue
+                generated_content[content_type] = generator(repo_data)
+
+            # Generate FAQs from online research; a failure here must not
+            # discard the content generated above.
+            # NOTE(review): do_online_research and generate_blog_faq are not
+            # among this module's visible imports - confirm they are in scope,
+            # otherwise this step always ends in the except branch below.
+            try:
+                research_report = do_online_research(repo_content.title, "gemini", github_url)
+                faqs = generate_blog_faq(research_report, "gemini")
+                generated_content["faqs"] = faqs
+            except Exception as err:
+                logger.error(f"Failed to generate FAQs: {err}")
+
+            return generated_content
+
+        except Exception as err:
+            logger.error(f"Failed to generate content: {err}")
+            raise
+
+ def save_content(self, content: Dict[str, str], base_filename: str):
+        """Write every generated document out through ``save_blog_to_file``.
+
+        Args:
+            content: Mapping of content type to generated text.
+            base_filename: Prefix for the per-type file name reported in the log.
+
+        Raises:
+            Exception: Any failure is logged and re-raised.
+        """
+        try:
+            for content_type, body in content.items():
+                # Metadata is derived separately for each document.
+                blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(body, "gemini")
+
+                # NOTE(review): this name is only used in the log line below; it
+                # is not passed to save_blog_to_file - confirm where the file
+                # is actually written.
+                filename = f"{base_filename}_{content_type}.md"
+
+                # The last argument is the image path; none is supplied.
+                save_blog_to_file(body, blog_title, blog_meta_desc, blog_tags, blog_categories, None)
+
+                logger.info(f"Saved {content_type} content to {filename}")
+
+        except Exception as err:
+            logger.error(f"Failed to save content: {err}")
+            raise
- # Do online research for faqs on the github url.
- try:
- # Repo names are misnomers for others search, include its decription too.
- # Which, skews the result favourably towards its home/paid pages.
- #online_query = f"{''.join(gh_url.split('/')[-1:])} " + gh_json.get('about')
- online_query = f"{''.join(gh_url.split('/')[-1:])} "
- logger.info("Do web research with Tavily & Metaphor AI.")
- research_report = do_online_research(online_query, "gemini", gh_url)
- except Exception as err:
- logger.error(f"failed to do online research: {err}")
+async def main():
+    """Run the generator over a couple of sample repositories.
+
+    Each URL is handled independently: content is generated for the listed
+    content types and saved under a base name taken from the last URL path
+    segment. A failure is logged and the loop moves on to the next URL.
+    """
+    blog_generator = GitHubBlogGenerator()
+
+    # Content kinds requested for every repository.
+    content_types = [
+        "getting_started",
+        "technical_docs",
+        "tutorials",
+        "comparison",
+        "case_studies",
+        "contribution",
+        "security",
+        "performance",
+    ]
+
+    # Placeholder repositories used for the example run.
+    sample_urls = (
+        "https://github.com/owner/repo",
+        "https://github.com/owner/another-repo",
+    )
+
+    for url in sample_urls:
+        try:
+            generated = await blog_generator.generate_content(url, content_types)
+            repo_name = url.split("/")[-1]
+            blog_generator.save_content(generated, repo_name)
+        except Exception as err:
+            logger.error(f"Error processing {url}: {err}")
- # Generate FAQs from the online research report.
- try:
- blog_faqs = generate_blog_faq(research_report, "gemini")
- howto_blog += f"\n\n## {''.join(gh_url.split('/')[-1:])} FAQs\n\n" + blog_faqs
- except Exception as err:
- logger.error(f"Failed to generate FAQs from web research_report: {err}")
-
- logger.info(f"\n\nFinal Blog Content: {howto_blog}\n\n")
-
- try:
- blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(howto_blog, "gemini")
- except Exception as err:
- logger.error(f"Failed to get blog metadata: {err}")
- raise err
-
- try:
- save_blog_to_file(howto_blog, blog_title, blog_meta_desc, blog_tags,\
- blog_categories, generated_image_filepath)
- except Exception as err:
- logger.error(f"Failed to save blog to a file: {err}")
- sys.exit(1)
-
- try:
- append_id_to_file(gh_url, "papers_already_written_on.txt")
- except Exception as err:
- logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
- raise err
+if __name__ == "__main__":
+    # asyncio is not part of this module's top-level imports, so bring it in
+    # here before starting the event loop.
+    import asyncio
+
+    asyncio.run(main())
diff --git a/lib/ai_writers/github_blogs/scrape_github_readme.py b/lib/ai_writers/github_blogs/scrape_github_readme.py
index 3e03958c..98efd98a 100644
--- a/lib/ai_writers/github_blogs/scrape_github_readme.py
+++ b/lib/ai_writers/github_blogs/scrape_github_readme.py
@@ -1,292 +1,422 @@
+"""
+Enhanced GitHub Content Scraper with Rate Limiting and Caching
+
+This module provides functionality to scrape GitHub repositories, READMEs, and code files
+for content marketing purposes. It includes async support, rate limiting, caching,
+and comprehensive metadata collection.
+"""
+
import os
import sys
-import datetime
-import pandas as pd
-
import json
-import requests
+import asyncio
+import aiohttp
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Union
+from urllib.parse import urljoin, urlparse
+import pandas as pd
from bs4 import BeautifulSoup
from loguru import logger
+import requests
+from pydantic import BaseModel, Field
+import time
+import pickle
+from pathlib import Path
+
+# Configure logging
logger.remove()
logger.add(sys.stdout,
colorize=True,
- format="{level}|{file}:{line}:{function}| {message}"
- )
+ format="{level}|{file}:{line}:{function}| {message}")
-
-from .take_url_screenshot import take_screenshot
-from .gpt_providers.gemini_image_details import gemini_get_img_info
-
-
-
-def get_readme_content(url):
- try:
- # Fetch the README content directly from the URL
- response = requests.get(url)
- print(response.status_code)
- if response.status_code == 200:
- logger.debug("Successfully fetched the README.md")
- readme_content = response.text
- else:
- readme_content = None
- return readme_content
- except Exception as err:
- logger.error(f"Failed to fetch raw readme from {url}: {err}: {response.status_code}")
- sys.exit(1)
-
-
-def get_gh_repo_metadata(github_url):
- """ Function to get the repo details like stars, commits, forks etc """
- logger.info("Scraping github with BS4 and requests.")
- # download the target page
- page = requests.get(github_url)
- # parse the HTML document returned by the server
- soup = BeautifulSoup(page.text, 'html.parser')
-
- # initialize the object that will contain the scraped data
- repo = {}
-
- # repo scraping logic
- name_html_element = soup.select_one('[itemprop="name"]')
- name = name_html_element.get_text().strip()
-
- git_branch_icon_html_element = soup.select_one('.octicon-git-branch')
- main_branch_html_element = git_branch_icon_html_element.find_next_sibling('span')
- main_branch = main_branch_html_element.get_text().strip()
-
- # scrape the repo history data
- boxheader_html_element = soup.select_one('.Box .Box-header')
-
- # scrape the repo details in the right box
- bordergrid_html_element = soup.select_one('.BorderGrid')
-
- about_html_element = bordergrid_html_element.select_one('h2')
- description_html_element = about_html_element.find_next_sibling('p')
- description = description_html_element.get_text().strip()
-
- star_icon_html_element = bordergrid_html_element.select_one('.octicon-star')
- stars_html_element = star_icon_html_element.find_next_sibling('strong')
- stars = stars_html_element.get_text().strip().replace(',', '')
-
- eye_icon_html_element = bordergrid_html_element.select_one('.octicon-eye')
- watchers_html_element = eye_icon_html_element.find_next_sibling('strong')
- watchers = watchers_html_element.get_text().strip().replace(',', '')
-
- fork_icon_html_element = bordergrid_html_element.select_one('.octicon-repo-forked')
- forks_html_element = fork_icon_html_element.find_next_sibling('strong')
- forks = forks_html_element.get_text().strip().replace(',', '')
-
- # Find the div with class "f6" containing topic links
- topic_div = soup.find('div', class_='f6')
- if topic_div:
- # Find all the topic links within the div
- topic_links = topic_div.find_all('a', class_='topic-tag-link')
- # Extract and print the topics
- repo['topics'] = [link.text.strip() for link in topic_links]
-
- # FIXME: Unable to scrape branch name.
- repo['branch_name'] = None
- # store the scraped data
- repo['name'] = name
- repo['about'] = description
- repo['stars'] = stars
- repo['watchers'] = watchers
- repo['forks'] = forks
- #repo['readme'] = readme
- logger.info(f"Github Repo Details: {repo}")
- return(repo)
-
-
-def get_gh_details_vision(github_url, generated_image_filepath):
- """ Take a screenshot of the url and feed to vision models for scraping details. """
- logger.info(f"Take screenshot and pass it to gemini for repo details of {github_url}")
-
- generated_image_filepath = take_screenshot(github_url, generated_image_filepath)
- prompt = """From the given image of a github page, find out the number of stars, about, forks, last commit days, link url, topics and branch name. Return the result as json."""
+class RateLimiter:
+ """Rate limiter for GitHub API requests."""
- try:
- gh_details = gemini_get_img_info(prompt, generated_image_filepath)
- logger.info(f"Github Repo details, from vision model: {gh_details}")
- #gh_details = get_gh_repo_metadata(github_url)
- except Exception as err:
- logger.error(f"Failed to get gh images details: {err}")
- gh_details = get_gh_repo_metadata(github_url)
- return gh_details
-
- # Convert string to dictionary Split the string into lines
- lines = gh_details.split('\n')
- # Remove the first and last line
- modified_lines = lines[1:-1]
- # Join the modified lines back into a string
- gh_details = '\n'.join(modified_lines)
- gh_details = json.loads(gh_details)
-
- return(gh_details)
-
-
-def research_github_topics(topics):
- """ Scrape github topics of interest for top repos to write on """
- # https://www.kaggle.com/code/subhaskumarray/scraping-github-topics-with-their-repositories
- # We are going to scrape https://github.com/topics
- # We will get a list of topics. For each topic, we will extract topic name, topic description and topic url.
- # For each topic, we will get top 30 repositories with repo name, repo username, stars and repo url.
- # Finally we are going to create csv file for each topic with respective repo details.
-
- #github_topics = "https://github.com/topics/"
- #response = requests.get(github_topics)
- #if response.status_code != 200:
- # logger.error(f'There is something wrong with {url}')
- #response_contents = response.text
- # Now we will parse the contents using BeautifulSoup:
- #parsed_contents = BeautifulSoup(response_contents,'html.parser')
- #logger.info("Get all topics, Titles and their urls from github.")
- #topic_titles = get_topic_titles(parsed_contents)
- #topic_desc = get_topic_desc(parsed_contents)
- #topic_urls = get_topic_url(parsed_contents)
- #topic_df = pd.DataFrame(list(zip(topic_titles, topic_desc,topic_urls)),\
- # columns =['title', 'description', 'url'])
- #logger.info(f"Scraped data from github: {topic_df}")
-
- gh_topics = ['ai', 'ai-tools', 'ai-assistant', 'ai-agents-framework', 'llm', 'multi-agent', 'fine-tuning', 'rag', 'generative', 'prompt-engineering', 'generative-ai', 'text-to-image-generation', 'llm-ops', 'retrieval-augmented-generation', 'langchain', 'gemini-api', 'vertex-ai', 'huggingface', 'auto-gpt', 'llmops', 'ai-toolkit', 'chatbot', 'chatgpt', 'code-assistant', 'text-to-video', 'llms', 'gpt-4']
-
- repo_info_dict = {
- 'username':[],
- 'repo_name': [],
- 'stars': [],
- 'repo_url': []
- }
- for agh_topic in gh_topics:
- topic_url = f"https://github.com/topics/{agh_topic}"
- first_topic_repo_page = download_repo_page(topic_url)
- logger.info(f"Get details on github topic: {topic_url}")
- repo_tags = first_topic_repo_page.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
- star_tags = first_topic_repo_page.find_all('span', {'class': 'Counter js-social-count'})
+ def __init__(self, calls_per_minute: int = 30):
+        """Set up a limiter that spaces calls evenly across each minute.
+
+        Args:
+            calls_per_minute: Maximum number of calls per minute; must be
+                greater than zero.
+
+        Raises:
+            ValueError: If ``calls_per_minute`` is zero or negative.
+        """
+        # Zero would divide by zero below and a negative rate would yield a
+        # negative interval that disables limiting, so reject both up front.
+        if calls_per_minute <= 0:
+            raise ValueError("calls_per_minute must be a positive number")
+        self.calls_per_minute = calls_per_minute
+        self.interval = 60 / calls_per_minute  # seconds between calls
+        self.last_call_time = 0
+        self.lock = asyncio.Lock()
- for i in range(len(repo_tags)):
- repo_details = get_repo_info(repo_tags[i], star_tags[i])
+ async def acquire(self):
+        """Wait until the next call is allowed, then record the call time.
+
+        Callers are serialised through ``self.lock``; each one sleeps for
+        whatever remains of ``self.interval`` since the previous call.
+        """
+        async with self.lock:
+            # A monotonic clock keeps the measured gap correct even when the
+            # system wall clock is adjusted while the process is running.
+            elapsed = time.monotonic() - self.last_call_time
+
+            if elapsed < self.interval:
+                await asyncio.sleep(self.interval - elapsed)
+
+            self.last_call_time = time.monotonic()
+
+class Cache:
+ """Cache for GitHub content."""
+
+ def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24):
+        """Prepare the cache directory and the entry lifetime.
+
+        Args:
+            cache_dir: Directory that holds the cache files; created if missing.
+            ttl_hours: Number of hours an entry stays valid.
+        """
+        self.cache_dir = Path(cache_dir)
+        self.ttl = timedelta(hours=ttl_hours)
+        # parents=True lets a nested cache_dir such as "var/cache/github" work
+        # even when the intermediate directories do not exist yet.
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+ def _get_cache_path(self, key: str) -> Path:
+        """Return the cache file path for ``key``.
+
+        The file name is the SHA-256 hex digest of the key. The built-in
+        ``hash()`` is salted per interpreter process for strings, so it cannot
+        name files that must be found again after a restart.
+        """
+        import hashlib
+
+        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
+        return self.cache_dir / f"{digest}.cache"
+
+ def get(self, key: str) -> Optional[Dict]:
+        """Return the cached value for ``key``, or ``None`` when unavailable.
+
+        ``None`` is returned when no cache file exists, when the entry is older
+        than ``self.ttl`` (the file is then deleted), or when the file cannot
+        be read; read errors are logged as warnings.
+        """
+        cache_path = self._get_cache_path(key)
-                # Check if the repo URL is not already present in the dictionary
- if repo_details[3] not in repo_info_dict['repo_url']:
- # Store repos with more than 5000 stars.
- if repo_details[2] > 5000:
- repo_info_dict['username'].append(repo_details[0])
- repo_info_dict['repo_name'].append(repo_details[1])
- repo_info_dict['stars'].append(repo_details[2])
- repo_info_dict['repo_url'].append(repo_details[3])
-
- # Create a DataFrame from repo_info_dict
- df_repo_info = pd.DataFrame(repo_info_dict['repo_url'])
-
- # Check if the file already exists
- csv_filename = 'github_url_to_write.csv'
- if os.path.isfile(csv_filename):
- # Append to the existing file
- df_repo_info.to_csv(csv_filename, mode='a', header=False, index=False)
- logger.info(f"Data appended to existing file: {csv_filename}")
- else:
- # Create a new file
- df_repo_info.to_csv(csv_filename, index=False)
-
-
-def get_topic_titles(parsed_content):
- try:
- selected_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
- topic_title_tags = parsed_content.find_all('p',{'class':selected_class})
- # We can make a list of topics
- topic_titles = []
- for tags in topic_title_tags:
- topic_titles.append(tags.text)
- return topic_titles
- except Exception as err:
- logger.error(f"Failed to get github topic titles: {err}")
-
-
-def get_topic_desc(parsed_contents):
- try:
- desc_selector = 'f5 color-fg-muted mb-0 mt-1'
- topic_desc_tags = parsed_contents.find_all('p',{'class': desc_selector})
- print(f"{topic_desc_tags}")
- topic_desc = []
- for desc in topic_desc_tags:
- print("dsfsfs")
- topic_desc.append(desc.text.strip()) # strip() is used for trimming all extra spaces in description.
- return topic_desc
- except Exception as err:
- logger.error(f"Failed to get github topic desc: {err}")
-
-
-def get_topic_url(parsed_contents):
- try:
- topic_link_tag = parsed_contents.find_all('a',{'class':'no-underline flex-1 d-flex flex-column'})
- topic_urls = []
- base_url = 'http://github.com'
- for urls in topic_link_tag:
- topic_urls.append(base_url + urls['href'])
- return topic_urls
- except Exception as err:
- logger.error(f"Failed to get github topic urls: {err}")
-
-
-def download_repo_page(topic_url):
- response = requests.get(topic_url)
- if response.status_code != 200:
- print('There is some error in {}'.format(topic_url))
- response_contents = response.text
+        if not cache_path.exists():
+            return None
+
+        try:
+            # SECURITY: pickle executes code on load. This is only safe while
+            # the cache directory holds nothing but files written by set().
+            with open(cache_path, 'rb') as f:
+                data = pickle.load(f)
+
+            # The expiry check runs after the file is closed: deleting a file
+            # that is still open fails on Windows.
+            if datetime.now() - data['timestamp'] > self.ttl:
+                cache_path.unlink()
+                return None
+            return data['value']
+        except Exception as e:
+            logger.warning(f"Cache read error for {key}: {e}")
+            return None
- parsed_contents = BeautifulSoup(response_contents,'html.parser')
- return parsed_contents
+ def set(self, key: str, value: Dict):
+        """Store ``value`` under ``key`` together with the current timestamp.
+
+        Write failures are logged as warnings and otherwise ignored, so a
+        broken cache never interrupts the caller.
+        """
+        target = self._get_cache_path(key)
+
+        try:
+            with open(target, 'wb') as handle:
+                record = {'timestamp': datetime.now(), 'value': value}
+                pickle.dump(record, handle)
+        except Exception as e:
+            logger.warning(f"Cache write error for {key}: {e}")
+class GitHubContent(BaseModel):
+    """Structured record of what was collected about one GitHub repository.
+
+    Every field has a default, so an instance can be created empty and
+    filled in piece by piece.
+    """
+    # Descriptive text
+    title: str = Field(default="", description="Title of the content")
+    description: str = Field(default="", description="Description of the content")
+    content: str = Field(default="", description="Main content")
+    language: str = Field(default="", description="Programming language")
+    # Popularity counters
+    stars: int = Field(default=0, description="Number of stars")
+    forks: int = Field(default=0, description="Number of forks")
+    watchers: int = Field(default=0, description="Number of watchers")
+    last_updated: str = Field(default="", description="Last update date")
+    topics: List[str] = Field(default=[], description="Repository topics")
+    contributors: List[str] = Field(default=[], description="Contributor usernames")
+    # Source locations
+    readme_url: str = Field(default="", description="URL of the README")
+    raw_content_url: str = Field(default="", description="URL for raw content")
+    license: str = Field(default="", description="Repository license")
+    dependencies: List[str] = Field(default=[], description="Project dependencies")
+    metadata: Dict = Field(default={}, description="Additional metadata")
-def get_repo_info(repo_tags,star_tags):
- # returns all info for a repo
- a_tags = repo_tags.find_all('a')
- username = a_tags[0].text.strip()
- repo_name = a_tags[1].text.strip()
- base_url = 'http://github.com/'
- repo_url = base_url + a_tags[1]['href'].strip()
+class GitHubScraper:
+ """Service for scraping GitHub content with rate limiting and caching."""
- # Defining a function so that it will convert our star count to integer
- def star_counts_converter(stars):
- stars = stars.strip()
- if stars[-1] == 'k':
- return int(float(stars[:-1]) * 1000)
- return int(stars)
- star_counts = star_counts_converter(star_tags.text.strip())
- return username,repo_name,star_counts,repo_url
+ def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24, calls_per_minute: int = 30):
+     """Prepare request headers, the rate limiter and the cache.
+
+     Args:
+         cache_dir: Directory passed to ``Cache``.
+         ttl_hours: Time-to-live value passed to ``Cache``.
+         calls_per_minute: Limit passed to ``RateLimiter``.
+
+     No HTTP session is opened here; ``self.session`` stays ``None`` until
+     the scraper is entered as an async context manager.
+     """
+     browser_user_agent = (
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+         'AppleWebKit/537.36 (KHTML, like Gecko) '
+         'Chrome/91.0.4472.124 Safari/537.36'
+     )
+     self.session = None
+     self.headers = {
+         'User-Agent': browser_user_agent,
+         'Accept': 'application/vnd.github.v3+json',
+     }
+     self.rate_limiter = RateLimiter(calls_per_minute)
+     self.cache = Cache(cache_dir, ttl_hours)
+
+ async def __aenter__(self):
+     """Open an aiohttp session carrying the default headers and return the scraper."""
+     session = aiohttp.ClientSession(headers=self.headers)
+     self.session = session
+     return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+     """Close the aiohttp session, if one exists, when leaving the context."""
+     session = self.session
+     if not session:
+         return
+     await session.close()
+
+ async def fetch_url(self, url: str, use_cache: bool = True) -> str:
+     """Fetch the body of ``url`` as text, with rate limiting and optional caching.
+
+     Args:
+         url: Address to request.
+         use_cache: When true, a cached body is returned if present and a
+             freshly fetched body is stored in the cache.
+
+     Returns:
+         The response body as text.
+
+     Raises:
+         Exception: If the response status is not 200. Errors raised while
+             performing the request are logged and re-raised unchanged.
+     """
+     if use_cache:
+         cached_content = self.cache.get(url)
+         if cached_content:
+             logger.debug(f"Cache hit for {url}")
+             return cached_content
+
+     await self.rate_limiter.acquire()
+
+     try:
+         async with self.session.get(url) as response:
+             status = response.status
+             # Only read the body of a successful response.
+             content = await response.text() if status == 200 else None
+     except Exception as e:
+         logger.error(f"Error fetching URL {url}: {e}")
+         raise
+
+     # The status failure is raised outside the try block so it is logged
+     # exactly once; raising it inside would pass through the handler above
+     # and produce a second log entry for the same failure.
+     if content is None:
+         error_msg = f"Failed to fetch URL: Status code {status}"
+         logger.error(f"Error fetching URL {url}: {error_msg}")
+         raise Exception(error_msg)
+
+     if use_cache:
+         self.cache.set(url, content)
+     return content
+
+ def parse_github_url(self, url: str) -> Dict[str, str]:
+     """Split a GitHub URL into owner, repository, branch and file path.
+
+     Args:
+         url: A URL whose path looks like ``/owner/repo`` or
+             ``/owner/repo/<kind>/<branch>/<path...>``.
+
+     Returns:
+         A dict with the keys ``owner``, ``repo``, ``branch`` (``'main'``
+         when the URL names none) and ``path`` (``''`` when the URL names
+         none).
+
+     Raises:
+         ValueError: If the path has fewer than two non-empty segments.
+     """
+     parsed = urlparse(url)
+     # Drop empty segments so stray or doubled slashes do not shift positions.
+     path_parts = [part for part in parsed.path.split('/') if part]
+
+     if len(path_parts) < 2:
+         raise ValueError("Invalid GitHub URL format")
+
+     repo = path_parts[1]
+     # Clone-style URLs end in ".git", which is not part of the repository name.
+     if repo.endswith('.git') and len(repo) > len('.git'):
+         repo = repo[:-len('.git')]
+
+     return {
+         'owner': path_parts[0],
+         'repo': repo,
+         'branch': path_parts[3] if len(path_parts) > 3 else 'main',
+         'path': '/'.join(path_parts[4:]) if len(path_parts) > 4 else ''
+     }
+
+ async def get_repo_metadata(self, owner: str, repo: str) -> Dict:
+     """Return the GitHub API description of ``owner/repo``.
+
+     A cached answer is used when available. Otherwise the API is queried
+     after waiting on the rate limiter, and a 200 response is cached.
+
+     Returns:
+         The decoded JSON document, or ``{}`` when the status is not 200 or
+         the request raises.
+     """
+     cache_key = f"metadata_{owner}_{repo}"
+     hit = self.cache.get(cache_key)
+     if hit:
+         return hit
+
+     await self.rate_limiter.acquire()
+
+     endpoint = f"https://api.github.com/repos/{owner}/{repo}"
+     try:
+         async with self.session.get(endpoint) as response:
+             if response.status != 200:
+                 logger.error(f"Failed to fetch repo metadata: {response.status}")
+                 return {}
+             payload = await response.json()
+             self.cache.set(cache_key, payload)
+             return payload
+     except Exception as e:
+         logger.error(f"Error fetching repo metadata: {e}")
+         return {}
+
+ async def get_readme_content(self, owner: str, repo: str, branch: str = 'main') -> Dict:
+     """Fetch a repository README, trying the API first and the HTML page second.
+
+     Returns:
+         A dict with ``content``, ``encoding`` and ``url``. ``encoding`` is
+         whatever the API reports (``'base64'`` if it reports none), or
+         ``'text'`` for the scraped fallback. ``{}`` is returned when
+         neither source yields a README or an error occurs.
+     """
+     cache_key = f"readme_{owner}_{repo}_{branch}"
+     hit = self.cache.get(cache_key)
+     if hit:
+         return hit
+
+     try:
+         # Preferred source: the REST API readme endpoint.
+         await self.rate_limiter.acquire()
+         endpoint = f"https://api.github.com/repos/{owner}/{repo}/readme"
+         async with self.session.get(endpoint) as response:
+             if response.status == 200:
+                 payload = await response.json()
+                 result = {
+                     'content': payload.get('content', ''),
+                     'encoding': payload.get('encoding', 'base64'),
+                     'url': payload.get('html_url', ''),
+                 }
+                 self.cache.set(cache_key, result)
+                 return result
+
+         # Any other status: fall back to the rendered README.md page.
+         page_url = f"https://github.com/{owner}/{repo}/blob/{branch}/README.md"
+         page_html = await self.fetch_url(page_url, use_cache=True)
+         document = BeautifulSoup(page_html, 'html.parser')
+
+         rendered = document.find('div', {'class': 'markdown-body'})
+         if not rendered:
+             return {}
+
+         result = {
+             'content': rendered.get_text(),
+             'encoding': 'text',
+             'url': page_url,
+         }
+         self.cache.set(cache_key, result)
+         return result
+     except Exception as e:
+         logger.error(f"Error fetching README: {e}")
+         return {}
+
+ async def get_file_content(self, owner: str, repo: str, path: str, branch: str = 'main') -> Dict:
+     """Fetch one file of a repository, trying the API first and the HTML page second.
+
+     Returns:
+         A dict with ``content``, ``encoding`` and ``url``. ``encoding`` is
+         whatever the API reports (``'base64'`` if it reports none), or
+         ``'text'`` for the scraped fallback. ``{}`` is returned when
+         neither source yields the file or an error occurs.
+     """
+     cache_key = f"file_{owner}_{repo}_{path}_{branch}"
+     hit = self.cache.get(cache_key)
+     if hit:
+         return hit
+
+     try:
+         # Preferred source: the REST API contents endpoint.
+         await self.rate_limiter.acquire()
+         endpoint = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+         async with self.session.get(endpoint) as response:
+             if response.status == 200:
+                 payload = await response.json()
+                 result = {
+                     'content': payload.get('content', ''),
+                     'encoding': payload.get('encoding', 'base64'),
+                     'url': payload.get('html_url', ''),
+                 }
+                 self.cache.set(cache_key, result)
+                 return result
+
+         # Any other status: fall back to the file's HTML page.
+         page_url = f"https://github.com/{owner}/{repo}/blob/{branch}/{path}"
+         page_html = await self.fetch_url(page_url, use_cache=True)
+         document = BeautifulSoup(page_html, 'html.parser')
+
+         container = document.find('div', {'class': 'file-content'})
+         if not container:
+             return {}
+
+         result = {
+             'content': container.get_text(),
+             'encoding': 'text',
+             'url': page_url,
+         }
+         self.cache.set(cache_key, result)
+         return result
+     except Exception as e:
+         logger.error(f"Error fetching file content: {e}")
+         return {}
+
+ async def get_repo_topics(self, owner: str, repo: str) -> List[str]:
+     """Return the topic names of ``owner/repo``.
+
+     A cached list is used when available. Otherwise the topics endpoint is
+     queried and a 200 response is cached.
+
+     Returns:
+         The ``names`` list from the response, or ``[]`` when the status is
+         not 200 or the request raises.
+     """
+     cache_key = f"topics_{owner}_{repo}"
+     hit = self.cache.get(cache_key)
+     if hit:
+         return hit
+
+     try:
+         await self.rate_limiter.acquire()
+         endpoint = f"https://api.github.com/repos/{owner}/{repo}/topics"
+         # This request overrides the session's default Accept header.
+         preview_headers = {'Accept': 'application/vnd.github.mercy-preview+json'}
+         async with self.session.get(endpoint, headers=preview_headers) as response:
+             if response.status != 200:
+                 return []
+             payload = await response.json()
+             names = payload.get('names', [])
+             self.cache.set(cache_key, names)
+             return names
+     except Exception as e:
+         logger.error(f"Error fetching topics: {e}")
+         return []
+
+ async def get_contributors(self, owner: str, repo: str) -> List[str]:
+     """Return the login names of the contributors of ``owner/repo``.
+
+     A cached list is used when available. Otherwise the contributors
+     endpoint is queried and a 200 response is cached.
+
+     Returns:
+         The ``login`` of every entry in the response, or ``[]`` when the
+         status is not 200 or an error occurs.
+     """
+     cache_key = f"contributors_{owner}_{repo}"
+     hit = self.cache.get(cache_key)
+     if hit:
+         return hit
+
+     try:
+         await self.rate_limiter.acquire()
+         endpoint = f"https://api.github.com/repos/{owner}/{repo}/contributors"
+         async with self.session.get(endpoint) as response:
+             if response.status != 200:
+                 return []
+             entries = await response.json()
+             logins = [entry['login'] for entry in entries]
+             self.cache.set(cache_key, logins)
+             return logins
+     except Exception as e:
+         logger.error(f"Error fetching contributors: {e}")
+         return []
+
+ async def scrape_github_content(self, url: str) -> GitHubContent:
+     """Scrape one GitHub URL into a ``GitHubContent`` object, with caching.
+
+     The URL is parsed into owner/repo/branch/path. Repository metadata,
+     the README or the addressed file, the topics and the contributors are
+     then fetched and combined. The finished object is cached under the
+     URL.
+
+     Args:
+         url: Repository, README or file URL on GitHub.
+
+     Returns:
+         The populated ``GitHubContent``.
+
+     Raises:
+         Exception: Any error from URL parsing or model construction is
+             logged and re-raised.
+     """
+     # Local import: the file's top-of-file import block is not changed here.
+     import base64
+
+     cache_key = f"content_{url}"
+     cached_content = self.cache.get(cache_key)
+     if cached_content:
+         return GitHubContent(**cached_content)
+
+     try:
+         # Parse the GitHub URL
+         repo_info = self.parse_github_url(url)
+         owner, repo = repo_info['owner'], repo_info['repo']
+
+         # Get repository metadata
+         metadata = await self.get_repo_metadata(owner, repo)
+
+         # Get content based on URL type
+         if not repo_info['path'] or repo_info['path'].lower() == 'readme.md':
+             content_data = await self.get_readme_content(
+                 owner,
+                 repo,
+                 repo_info['branch']
+             )
+         else:
+             content_data = await self.get_file_content(
+                 owner,
+                 repo,
+                 repo_info['path'],
+                 repo_info['branch']
+             )
+
+         # API results carry base64 text; decode it so ``content`` holds the
+         # readable document instead of its encoded form.
+         text = content_data.get('content') or ''
+         if text and content_data.get('encoding') == 'base64':
+             try:
+                 text = base64.b64decode(text).decode('utf-8', errors='replace')
+             except ValueError as decode_error:
+                 # Best effort: keep the raw payload rather than fail the scrape.
+                 logger.warning(f"Could not decode base64 content for {url}: {decode_error}")
+
+         # Get additional metadata
+         topics = await self.get_repo_topics(owner, repo)
+         contributors = await self.get_contributors(owner, repo)
+
+         # The API reports absent values (description, language, license,
+         # ...) as null, so ``dict.get`` defaults do not apply to them;
+         # ``or`` maps null to the field's empty value instead.
+         license_info = metadata.get('license') or {}
+
+         # Create GitHubContent object
+         content = GitHubContent(
+             title=metadata.get('name') or '',
+             description=metadata.get('description') or '',
+             content=text,
+             language=metadata.get('language') or '',
+             stars=metadata.get('stargazers_count') or 0,
+             forks=metadata.get('forks_count') or 0,
+             watchers=metadata.get('watchers_count') or 0,
+             last_updated=metadata.get('updated_at') or '',
+             topics=topics,
+             contributors=contributors,
+             readme_url=content_data.get('url') or '',
+             raw_content_url=metadata.get('html_url') or '',
+             license=license_info.get('name') or '',
+             metadata={
+                 'size': metadata.get('size', 0),
+                 'open_issues': metadata.get('open_issues_count', 0),
+                 'default_branch': metadata.get('default_branch', 'main'),
+                 'created_at': metadata.get('created_at', ''),
+                 'pushed_at': metadata.get('pushed_at', '')
+             }
+         )
+
+         # Cache the complete content
+         self.cache.set(cache_key, content.dict())
+
+         return content
+
+     except Exception as e:
+         logger.error(f"Error scraping GitHub content: {e}")
+         raise
+
+async def main():
+    """Demonstrate the scraper on a few placeholder URLs.
+
+    Each URL is scraped in turn inside one scraper context. The result is
+    printed as indented JSON, and a failure is printed without stopping the
+    remaining URLs.
+    """
+    example_urls = [
+        "https://github.com/owner/repo",
+        "https://github.com/owner/repo/blob/main/README.md",
+        "https://github.com/owner/repo/blob/main/src/main.py",
+    ]
+    scraper = GitHubScraper(cache_dir=".github_cache", ttl_hours=24, calls_per_minute=30)
+
+    async with scraper:
+        for url in example_urls:
+            try:
+                scraped = await scraper.scrape_github_content(url)
+                print(f"Scraped content from {url}:")
+                print(json.dumps(scraped.dict(), indent=2))
+            except Exception as e:
+                print(f"Error scraping {url}: {e}")
-def save_to_csv(topic_url,topic_name):
- file_name = topic_name + '.csv'
- if os.path.exists(file_name):
- logger.debug(f"The file {file_name} already exists. Skipping.")
- topics_df = topic_repo_details(topic_url)
- topics_df.to_csv(file_name,index=None)
- logger.info(f"Successfully scraped topic {topic_name}")
-
-
-def check_if_already_written(github_url, file_path='papers_already_written_on.txt'):
- """
- Check if a GitHub URL is an exact match in each line of a file.
-
- Args:
- github_url (str): GitHub URL string to check.
- file_path (str): Path to the file containing lines to check against. Default is 'papers_already_written_on.txt'.
-
- Returns:
- bool: True if an exact match is found, False otherwise.
- """
- try:
- with open(file_path, 'r', encoding="utf-8") as file:
- # Read each line in the file
- for line in file:
- # Check for an exact match
- if github_url.strip() == line.strip():
- return True
- except FileNotFoundError:
- print(f"File not found: {file_path}")
- except Exception as e:
- print(f"An error occurred: {str(e)}")
- return False
+# Run the example only when this file is executed directly as a script.
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/lib/blog_sections/faqs_generator_blog.py b/lib/blog_sections/faqs_generator_blog.py
deleted file mode 100644
index a0c7b87d..00000000
--- a/lib/blog_sections/faqs_generator_blog.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import sys
-
-from .gpt_providers.openai_chat_completion import openai_chatgpt
-from .gpt_providers.gemini_pro_text import gemini_text_response
-
-from loguru import logger
-logger.remove()
-logger.add(sys.stdout,
- colorize=True,
- format="{level}|{file}:{line}:{function}| {message}"
- )
-
-
-def generate_blog_faq(blog_article, gpt_providers="openai"):
- """
- Given a blog title generate an outline for it
- """
- logger.info("Generating blog FAQs.")
- prompt = f"""As an expert writer, I will provide you with blog content below.
- Your task is to write 5 FAQs based on the given blog content.
- Always, write fact based answers. Use emojis where applicable.
- You must reply in MARKDOWN format.
- blog content: '{blog_article}' """
-
- if 'gemini' in gpt_providers:
- try:
- response = gemini_text_response(prompt)
- return response
- except Exception as err:
- logger.error(f"Failed to get response from gemini: {err}")
- elif 'openai' in gpt_providers:
- try:
- logger.info("Calling OpenAI LLM.")
- response = openai_chatgpt(prompt)
- return response
- except Exception as err:
- SystemError(f"Failed to get response from Openai: {err}")