From 26b02b97190b3053529d6acee872014a2a29b18e Mon Sep 17 00:00:00 2001 From: ajaysi Date: Sun, 4 May 2025 17:04:44 +0530 Subject: [PATCH] AI FAQ Generator & github blogs --- lib/ai_writers/ai_blog_faqs_writer/README.md | 192 +++++ .../faqs_generator_blog.py | 386 ++++++++++ lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py | 177 +++++ lib/ai_writers/ai_writer_dashboard.py | 10 +- lib/ai_writers/github_blogs/README.md | 259 +++++++ .../github_blogs/github_getting_started.py | 277 +++++++- .../main_getting_started_blogs.py | 263 +++---- .../github_blogs/scrape_github_readme.py | 672 +++++++++++------- lib/blog_sections/faqs_generator_blog.py | 37 - 9 files changed, 1810 insertions(+), 463 deletions(-) create mode 100644 lib/ai_writers/ai_blog_faqs_writer/README.md create mode 100644 lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py create mode 100644 lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py create mode 100644 lib/ai_writers/github_blogs/README.md delete mode 100644 lib/blog_sections/faqs_generator_blog.py diff --git a/lib/ai_writers/ai_blog_faqs_writer/README.md b/lib/ai_writers/ai_blog_faqs_writer/README.md new file mode 100644 index 00000000..6023e0c7 --- /dev/null +++ b/lib/ai_writers/ai_blog_faqs_writer/README.md @@ -0,0 +1,192 @@ +# AI-Powered FAQ Generator + +A sophisticated FAQ generation system that creates comprehensive, well-researched FAQs from various content sources. This tool leverages AI to analyze content, conduct web research, and generate detailed FAQs with customizable options. + +## Features + +### Content Processing +- **Multiple Input Sources** + - Direct text input + - File uploads (DOCX, TXT) + - URL content extraction + - Support for any content type (general, technical, educational, etc.) 
+ +### Research Capabilities +- **Multi-level Search Depth** + - **Basic**: Google Search for quick, general information + - **Comprehensive**: Tavily AI for detailed, in-depth research + - **Expert**: Metaphor AI for specialized, expert-level content + +### Customization Options +- **Target Audience** + - Beginner + - Intermediate + - Expert + +- **FAQ Style** + - Technical + - Conversational + - Professional + +- **Advanced Features** + - Emoji inclusion + - Code example generation + - Reference integration + - Customizable time range for research + - Multi-language support + +### Output Formats +- Interactive preview +- Markdown +- HTML +- JSON + +## Installation + +1. Clone the repository +2. Install dependencies: +```bash +pip install -r requirements.txt +``` + +## Usage + +### Basic Usage +```python +from lib.ai_writers.ai_blog_faqs_writer.faqs_generator_blog import FAQGenerator, FAQConfig + +# Initialize with default configuration +generator = FAQGenerator() + +# Generate FAQs from content +faqs = await generator.generate_faqs("Your content here") +``` + +### Advanced Configuration +```python +from lib.ai_writers.ai_blog_faqs_writer.faqs_generator_blog import ( + FAQGenerator, FAQConfig, TargetAudience, FAQStyle, SearchDepth +) + +# Custom configuration +config = FAQConfig( + num_faqs=10, + target_audience=TargetAudience.INTERMEDIATE, + faq_style=FAQStyle.TECHNICAL, + include_emojis=True, + include_code_examples=True, + include_references=True, + search_depth=SearchDepth.COMPREHENSIVE, + time_range="last_6_months", + language="English" +) + +generator = FAQGenerator(config) +``` + +### Web Interface +Run the Streamlit interface: +```bash +streamlit run lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py +``` + +## Research Process + +1. **Content Analysis** + - Identifies key topics and concepts + - Extracts potential questions + - Determines research requirements + +2. 
**Web Research** + - Selects appropriate search function based on depth + - Gathers relevant information + - Validates and cross-references data + +3. **FAQ Generation** + - Creates comprehensive questions + - Provides detailed answers + - Includes code examples (if applicable) + - Adds references and citations + +## Output Structure + +Each FAQ item includes: +- Question +- Detailed answer +- Category +- Code example (if applicable) +- References +- Confidence score +- Last updated timestamp + +## Configuration Options + +### FAQConfig Parameters +- `num_faqs`: Number of FAQs to generate (default: 5) +- `target_audience`: Target audience level (default: INTERMEDIATE) +- `faq_style`: Writing style (default: PROFESSIONAL) +- `include_emojis`: Whether to include emojis (default: True) +- `include_code_examples`: Whether to include code examples (default: True) +- `include_references`: Whether to include references (default: True) +- `search_depth`: Research depth level (default: COMPREHENSIVE) +- `time_range`: Time range for research (default: "last_6_months") +- `language`: Output language (default: "English") + +## Research Depth Options + +### Basic (Google Search) +- Quick, general information +- Broad coverage +- Suitable for basic topics + +### Comprehensive (Tavily AI) +- Detailed, in-depth research +- Multiple source integration +- Best for most use cases + +### Expert (Metaphor AI) +- Specialized, expert-level content +- Advanced topic coverage +- Technical and academic focus + +## Best Practices + +1. **Content Preparation** + - Provide clear, well-structured content + - Include key terms and concepts + - Specify target audience and style + +2. **Research Selection** + - Use Basic for general topics + - Choose Comprehensive for detailed analysis + - Select Expert for technical subjects + +3. **Output Review** + - Verify accuracy of information + - Check code examples + - Validate references + +## Contributing + +1. Fork the repository +2. 
Create a feature branch +3. Commit your changes +4. Push to the branch +5. Create a Pull Request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + +## Support + +For support, please open an issue in the repository or contact the maintainers. + +## Acknowledgments + +- OpenAI for GPT integration +- Google Search API +- Tavily AI +- Metaphor AI +- BeautifulSoup for web scraping +- Streamlit for UI \ No newline at end of file diff --git a/lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py b/lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py new file mode 100644 index 00000000..8a8f8ea6 --- /dev/null +++ b/lib/ai_writers/ai_blog_faqs_writer/faqs_generator_blog.py @@ -0,0 +1,386 @@ +""" +Enhanced FAQ Generator + +This module provides a comprehensive FAQ generation system that can create detailed, +well-researched FAQs from various content sources with customizable options. +""" + +import sys +import json +from typing import Dict, List, Optional, Union +from pathlib import Path +from enum import Enum +from dataclasses import dataclass +from loguru import logger + +from lib.gpt_providers.text_generation.main_text_generation import llm_text_gen +from lib.ai_web_researcher.google_serp_search import google_search +from lib.ai_web_researcher.tavily_ai_search import tavily_search +from lib.ai_web_researcher.metaphor_basic_neural_web_search import metaphor_search_articles + +logger.remove() +logger.add(sys.stdout, + colorize=True, + format="{level}|{file}:{line}:{function}| {message}") + +class TargetAudience(Enum): + BEGINNER = "beginner" + INTERMEDIATE = "intermediate" + EXPERT = "expert" + +class FAQStyle(Enum): + TECHNICAL = "technical" + CONVERSATIONAL = "conversational" + PROFESSIONAL = "professional" + +class SearchDepth(Enum): + BASIC = "basic" + COMPREHENSIVE = "comprehensive" + EXPERT = "expert" + +@dataclass +class FAQConfig: + """Configuration for FAQ generation.""" + num_faqs: int = 5 + 
target_audience: TargetAudience = TargetAudience.INTERMEDIATE + faq_style: FAQStyle = FAQStyle.PROFESSIONAL + include_emojis: bool = True + include_code_examples: bool = True + include_references: bool = True + search_depth: SearchDepth = SearchDepth.COMPREHENSIVE + time_range: str = "last_6_months" + exclude_domains: List[str] = None + language: str = "English" + +@dataclass +class FAQItem: + """Individual FAQ item with metadata.""" + question: str + answer: str + category: str + code_example: Optional[str] = None + references: List[Dict[str, str]] = None + confidence_score: float = 0.0 + last_updated: str = None + +class FAQGenerator: + """Enhanced FAQ Generator with research capabilities.""" + + def __init__(self, config: Optional[FAQConfig] = None): + """Initialize the FAQ generator with optional configuration.""" + self.config = config or FAQConfig() + self.faqs: List[FAQItem] = [] + self.research_results = {} + + async def generate_faqs(self, content: str, content_type: str = "general") -> List[FAQItem]: + """Generate FAQs from the given content with research integration.""" + try: + # Step 1: Research the topic + research_results = await self._conduct_research(content) + + # Step 2: Generate initial FAQs + initial_faqs = await self._generate_initial_faqs(content, research_results) + + # Step 3: Enhance FAQs with research + enhanced_faqs = await self._enhance_faqs_with_research(initial_faqs, research_results) + + # Step 4: Add code examples if requested + if self.config.include_code_examples: + enhanced_faqs = await self._add_code_examples(enhanced_faqs) + + # Step 5: Add references if requested + if self.config.include_references: + enhanced_faqs = await self._add_references(enhanced_faqs, research_results) + + self.faqs = enhanced_faqs + return enhanced_faqs + + except Exception as err: + logger.error(f"Failed to generate FAQs: {err}") + raise + + async def _conduct_research(self, content: str) -> Dict: + """Conduct online research based on the content.""" 
+ try: + research_prompt = f"""Based on the following content, identify key topics and questions for research: + {content} + + Please provide a list of research topics and questions that would help create comprehensive FAQs. + Focus on: + 1. Key concepts and terms + 2. Common questions users might have + 3. Technical aspects that need clarification + 4. Best practices and recommendations + """ + + research_topics = await llm_text_gen(research_prompt) + + # Conduct research for each topic + research_results = {} + for topic in research_topics.split('\n'): + if topic.strip(): + # Select search function based on search depth + if self.config.search_depth == SearchDepth.BASIC: + results = await google_search(topic.strip()) + elif self.config.search_depth == SearchDepth.COMPREHENSIVE: + results = await tavily_search(topic.strip()) + elif self.config.search_depth == SearchDepth.EXPERT: + results = await metaphor_search_articles(topic.strip()) + else: + logger.warning(f"Unknown search depth: {self.config.search_depth}, defaulting to Google search") + results = await google_search(topic.strip()) + + research_results[topic.strip()] = results + + return research_results + + except Exception as err: + logger.error(f"Failed to conduct research: {err}") + return {} + + async def _generate_initial_faqs(self, content: str, research_results: Dict) -> List[FAQItem]: + """Generate initial FAQs using LLM.""" + try: + system_prompt = f"""You are an expert FAQ generator with deep knowledge in content creation and technical writing. + Your task is to create comprehensive FAQs based on the given content and research. + + Guidelines: + 1. Target Audience: {self.config.target_audience.value} + 2. Style: {self.config.faq_style.value} + 3. Include emojis: {self.config.include_emojis} + 4. Language: {self.config.language} + 5. 
Number of FAQs: {self.config.num_faqs} + + Create FAQs that are: + - Clear and concise + - Well-structured + - Technically accurate + - Engaging and informative + - Based on the provided research + - Relevant to the target audience + - Written in the specified style + """ + + prompt = f"""Content to generate FAQs from: + {content} + + Research Results: + {json.dumps(research_results, indent=2)} + + Please generate {self.config.num_faqs} FAQs following the guidelines above. + Format each FAQ with: + - Question + - Detailed answer + - Category + - Confidence score (0-1) + """ + + response = await llm_text_gen(prompt, system_prompt=system_prompt) + + # Parse the response into FAQItem objects + faqs = [] + current_faq = None + + for line in response.split('\n'): + if line.startswith('Q:'): + if current_faq: + faqs.append(current_faq) + current_faq = FAQItem(question=line[2:].strip(), answer="", category="") + elif line.startswith('A:'): + if current_faq: + current_faq.answer = line[2:].strip() + elif line.startswith('Category:'): + if current_faq: + current_faq.category = line[9:].strip() + elif line.startswith('Confidence:'): + if current_faq: + current_faq.confidence_score = float(line[11:].strip()) + + if current_faq: + faqs.append(current_faq) + + return faqs + + except Exception as err: + logger.error(f"Failed to generate initial FAQs: {err}") + raise + + async def _enhance_faqs_with_research(self, faqs: List[FAQItem], research_results: Dict) -> List[FAQItem]: + """Enhance FAQs with research findings.""" + try: + enhanced_faqs = [] + + for faq in faqs: + # Find relevant research for this FAQ + relevant_research = self._find_relevant_research(faq, research_results) + + if relevant_research: + # Enhance the answer with research findings + enhancement_prompt = f"""Enhance the following FAQ answer with the provided research: + + Question: {faq.question} + Current Answer: {faq.answer} + + Research: + {json.dumps(relevant_research, indent=2)} + + Please enhance the 
answer while: + 1. Maintaining the original style and tone + 2. Adding relevant information from the research + 3. Ensuring technical accuracy + 4. Keeping the answer concise and clear + """ + + enhanced_answer = await llm_text_gen(enhancement_prompt) + faq.answer = enhanced_answer + + enhanced_faqs.append(faq) + + return enhanced_faqs + + except Exception as err: + logger.error(f"Failed to enhance FAQs with research: {err}") + return faqs + + async def _add_code_examples(self, faqs: List[FAQItem]) -> List[FAQItem]: + """Add code examples to FAQs where applicable.""" + try: + for faq in faqs: + if self._is_technical_question(faq.question): + code_prompt = f"""Generate a code example for the following FAQ: + + Question: {faq.question} + Answer: {faq.answer} + + Please provide a relevant code example that: + 1. Illustrates the answer clearly + 2. Includes comments and explanations + 3. Follows best practices + 4. Is easy to understand + """ + + code_example = await llm_text_gen(code_prompt) + faq.code_example = code_example + + return faqs + + except Exception as err: + logger.error(f"Failed to add code examples: {err}") + return faqs + + async def _add_references(self, faqs: List[FAQItem], research_results: Dict) -> List[FAQItem]: + """Add references to FAQs.""" + try: + for faq in faqs: + relevant_research = self._find_relevant_research(faq, research_results) + if relevant_research: + faq.references = [ + { + "title": ref.get("title", ""), + "url": ref.get("url", ""), + "source": ref.get("source", ""), + "date": ref.get("date", "") + } + for ref in relevant_research.get("references", []) + ] + + return faqs + + except Exception as err: + logger.error(f"Failed to add references: {err}") + return faqs + + def _find_relevant_research(self, faq: FAQItem, research_results: Dict) -> Dict: + """Find research relevant to a specific FAQ.""" + # Simple keyword matching for now - can be enhanced with semantic search + relevant_research = {} + for topic, results in 
research_results.items(): + if any(keyword in faq.question.lower() for keyword in topic.lower().split()): + relevant_research[topic] = results + return relevant_research + + def _is_technical_question(self, question: str) -> bool: + """Determine if a question is technical and might benefit from a code example.""" + technical_keywords = ["code", "program", "function", "method", "class", "api", "syntax", "error", "debug"] + return any(keyword in question.lower() for keyword in technical_keywords) + + def to_markdown(self) -> str: + """Convert FAQs to markdown format.""" + markdown = "# Frequently Asked Questions\n\n" + + for i, faq in enumerate(self.faqs, 1): + markdown += f"## {i}. {faq.question}\n\n" + markdown += f"{faq.answer}\n\n" + + if faq.code_example: + markdown += "```\n" + markdown += f"{faq.code_example}\n" + markdown += "```\n\n" + + if faq.references: + markdown += "### References\n" + for ref in faq.references: + markdown += f"- [{ref['title']}]({ref['url']}) - {ref['source']} ({ref['date']})\n" + markdown += "\n" + + return markdown + + def to_html(self) -> str: + """Convert FAQs to HTML format.""" + html = """ + + + + Frequently Asked Questions + + + +
+

Frequently Asked Questions

+ """ + + for i, faq in enumerate(self.faqs, 1): + html += f""" +
+
{i}. {faq.question}
+
{faq.answer}
+ """ + + if faq.code_example: + html += f""" +
{faq.code_example}
+ """ + + if faq.references: + html += """ +
+

References

+
    + """ + for ref in faq.references: + html += f""" +
  • {ref['title']} - {ref['source']} ({ref['date']})
  • + """ + html += """ +
+
+ """ + + html += """ +
+ """ + + html += """ +
+ + + """ + + return html diff --git a/lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py b/lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py new file mode 100644 index 00000000..720cb91b --- /dev/null +++ b/lib/ai_writers/ai_blog_faqs_writer/faqs_ui.py @@ -0,0 +1,177 @@ +""" +Streamlit UI for FAQ Generator + +This module provides a user-friendly interface for generating FAQs from various content sources. +""" + +import streamlit as st +import asyncio +from pathlib import Path +from typing import Optional +import json +import requests +from bs4 import BeautifulSoup + +from .faqs_generator_blog import FAQGenerator, FAQConfig, TargetAudience, FAQStyle, SearchDepth + + +def fetch_url_content(url): + """Fetch and extract content from a URL.""" + try: + response = requests.get(url) + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + # Remove script and style elements + for script in soup(["script", "style"]): + script.decompose() + + # Get text + text = soup.get_text() + + # Break into lines and remove leading and trailing space + lines = (line.strip() for line in text.splitlines()) + # Break multi-headlines into a line each + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + # Drop blank lines + text = '\n'.join(chunk for chunk in chunks if chunk) + + return text + except Exception as e: + st.error(f"Error fetching URL content: {str(e)}") + return None + +def main(): + st.set_page_config( + page_title="FAQ Generator", + page_icon="❓", + layout="wide" + ) + + st.title("FAQ Generator") + st.markdown("Generate comprehensive FAQs from your content with research integration.") + + # Sidebar for configuration + with st.sidebar: + st.header("Configuration") + + # Basic settings + num_faqs = st.slider("Number of FAQs", 1, 20, 5) + target_audience = st.selectbox( + "Target Audience", + [audience.value for audience in TargetAudience] + ) + faq_style = st.selectbox( + "FAQ Style", + [style.value for style in FAQStyle] + ) + + # 
Advanced settings + with st.expander("Advanced Settings"): + include_emojis = st.checkbox("Include Emojis", value=True) + include_code_examples = st.checkbox("Include Code Examples", value=True) + include_references = st.checkbox("Include References", value=True) + + search_depth = st.selectbox( + "Search Depth", + [depth.value for depth in SearchDepth] + ) + time_range = st.selectbox( + "Time Range", + ["last_month", "last_6_months", "last_year", "all_time"] + ) + language = st.text_input("Language", value="English") + + # Main content area + content_type = st.radio( + "Content Source", + ["Direct Input", "File Upload", "URL"] + ) + + content = "" + if content_type == "Direct Input": + content = st.text_area("Enter your content", height=300) + + elif content_type == "URL": + url = st.text_input("Enter URL") + if url: + content = fetch_url_content(url) + if content: + st.text_area("Extracted Content", content, height=300) + + # Generate button + if st.button("Generate FAQs") and content: + try: + # Create config + config = FAQConfig( + num_faqs=num_faqs, + target_audience=TargetAudience(target_audience), + faq_style=FAQStyle(faq_style), + include_emojis=include_emojis, + include_code_examples=include_code_examples, + include_references=include_references, + search_depth=SearchDepth(search_depth), + time_range=time_range, + language=language + ) + + # Initialize generator + generator = FAQGenerator(config) + + # Generate FAQs + with st.spinner("Generating FAQs..."): + faqs = asyncio.run(generator.generate_faqs(content)) + + # Display results + st.success("FAQs generated successfully!") + + # Output format selection + output_format = st.radio( + "Output Format", + ["Preview", "Markdown", "HTML", "JSON"] + ) + + if output_format == "Preview": + for i, faq in enumerate(faqs, 1): + with st.expander(f"{i}. 
{faq.question}"): + st.markdown(faq.answer) + if faq.code_example: + st.code(faq.code_example) + if faq.references: + st.markdown("**References:**") + for ref in faq.references: + st.markdown(f"- [{ref['title']}]({ref['url']}) - {ref['source']} ({ref['date']})") + + elif output_format == "Markdown": + st.code(generator.to_markdown(), language="markdown") + st.download_button( + "Download Markdown", + generator.to_markdown(), + file_name="faqs.md", + mime="text/markdown" + ) + + elif output_format == "HTML": + st.code(generator.to_html(), language="html") + st.download_button( + "Download HTML", + generator.to_html(), + file_name="faqs.html", + mime="text/html" + ) + + elif output_format == "JSON": + json_output = json.dumps([faq.__dict__ for faq in faqs], indent=2) + st.code(json_output, language="json") + st.download_button( + "Download JSON", + json_output, + file_name="faqs.json", + mime="application/json" + ) + + except Exception as e: + st.error(f"Error generating FAQs: {str(e)}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/lib/ai_writers/ai_writer_dashboard.py b/lib/ai_writers/ai_writer_dashboard.py index a4f116fe..ba157a9a 100644 --- a/lib/ai_writers/ai_writer_dashboard.py +++ b/lib/ai_writers/ai_writer_dashboard.py @@ -6,7 +6,7 @@ from lib.ai_writers.ai_product_description_writer import write_ai_prod_desc from lib.ai_writers.ai_copywriter.copywriter_dashboard import copywriter_dashboard from lib.ai_writers.linkedin_writer import LinkedInAIWriter from lib.ai_writers.blog_rewriter_updater.ai_blog_rewriter import write_blog_rewriter -#from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_content_planner +from lib.ai_writers.ai_blog_faqs_writer.faqs_ui import main as faqs_generator from lib.ai_writers.ai_blog_writer.ai_blog_generator import ai_blog_writer_page from loguru import logger @@ -84,6 +84,14 @@ def list_ai_writers(): "category": "Professional", "function": lambda: 
LinkedInAIWriter().run(), "path": "linkedin_writer" + }, + { + "name": "FAQ Generator", + "icon": "❓", + "description": "Generate comprehensive, well-researched FAQs from any content source with customizable options", + "category": "Content Creation", + "function": faqs_generator, + "path": "faqs_generator" } ] diff --git a/lib/ai_writers/github_blogs/README.md b/lib/ai_writers/github_blogs/README.md new file mode 100644 index 00000000..194490a1 --- /dev/null +++ b/lib/ai_writers/github_blogs/README.md @@ -0,0 +1,259 @@ +# GitHub Blog Generator + +A powerful AI-powered content generation system that automatically creates comprehensive documentation, tutorials, and guides from GitHub repositories. This module transforms GitHub repository data into various types of high-quality technical content. + +## Features + +### 1. Content Generation Types + +The system can generate the following types of content from GitHub repositories: + +- **Getting Started Guides** + - Introduction and Overview + - Prerequisites and Setup + - Installation Instructions + - Basic Usage Examples + - Common Use Cases + - Best Practices + - Next Steps and Resources + +- **Technical Documentation** + - Architecture Overview + - Core Components + - Technical Specifications + - Integration Points + - Performance Considerations + - Security Features + - API Documentation + - Configuration Options + - Deployment Guidelines + - Troubleshooting Guide + +- **Tutorial Series** + - Beginner Tutorials + - Basic concepts + - Simple examples + - Step-by-step instructions + - Intermediate Tutorials + - Advanced features + - Real-world examples + - Best practices + - Advanced Tutorials + - Complex use cases + - Performance optimization + - Integration patterns + +- **Comparison Analysis** + - Feature Comparison + - Performance Analysis + - Use Case Suitability + - Community and Support + - Learning Curve + - Integration Capabilities + - Future Prospects + +- **Case Studies** + - Problem Statement + - Solution 
Implementation + - Technical Challenges + - Results and Benefits + - Lessons Learned + - Future Improvements + +- **Contribution Guides** + - Development Setup + - Code Style Guidelines + - Testing Requirements + - Documentation Standards + - Pull Request Process + - Review Guidelines + - Community Guidelines + +- **Security Guides** + - Security Architecture + - Authentication & Authorization + - Data Protection + - Secure Configuration + - Vulnerability Management + - Incident Response + - Compliance Requirements + +- **Performance Guides** + - Performance Metrics + - Optimization Techniques + - Benchmarking Guidelines + - Resource Management + - Scaling Strategies + - Monitoring Setup + - Troubleshooting + +### 2. GitHub Content Scraping + +The module includes a sophisticated GitHub content scraper with the following capabilities: + +- **Rate Limiting** + - Configurable API call limits + - Automatic request throttling + - Concurrent request management + +- **Caching System** + - Configurable cache duration (TTL) + - Automatic cache invalidation + - Efficient storage of scraped content + +- **Content Extraction** + - Repository metadata + - README content + - File contents + - Repository topics + - Contributor information + - License information + +### 3. 
Content Enhancement + +- **Online Research Integration** + - Automatic topic research + - Related content discovery + - Industry trend analysis + +- **FAQ Generation** + - Automatic FAQ creation + - Common question identification + - Comprehensive answers + +- **Metadata Generation** + - SEO-optimized titles + - Meta descriptions + - Tags and categories + - Content structuring + +## Usage Examples + +### Basic Usage + +```python +from lib.ai_writers.github_blogs import GitHubBlogGenerator + +# Initialize the generator +generator = GitHubBlogGenerator() + +# Generate content for a GitHub repository +content = await generator.generate_content( + github_url="https://github.com/owner/repo", + content_types=["getting_started", "technical_docs", "tutorials"] +) + +# Save the generated content +generator.save_content(content, "my_repository") +``` + +### Advanced Usage + +```python +from lib.ai_writers.github_blogs import GitHubBlogGenerator + +# Initialize with custom settings +generator = GitHubBlogGenerator( + cache_dir=".custom_cache", + ttl_hours=48 +) + +# Generate all content types +content_types = [ + "getting_started", + "technical_docs", + "tutorials", + "comparison", + "case_studies", + "contribution", + "security", + "performance" +] + +# Generate content for multiple repositories +urls = [ + "https://github.com/owner/repo1", + "https://github.com/owner/repo2" +] + +for url in urls: + content = await generator.generate_content(url, content_types) + generator.save_content(content, url.split("/")[-1]) +``` + +## Configuration Options + +### GitHubBlogGenerator + +- `cache_dir` (str): Directory for caching scraped content (default: ".github_cache") +- `ttl_hours` (int): Time-to-live for cached content in hours (default: 24) + +### Content Generation + +- `gpt_provider` (str): Choice of AI provider ("gemini" or "openai") +- `content_types` (List[str]): Types of content to generate +- `github_url` (str): URL of the GitHub repository + +## Output Format + +All 
generated content is saved in Markdown format with the following structure: + +```markdown +# [Title] + +[Generated content based on content type] + +## Metadata +- Title: [SEO-optimized title] +- Description: [Meta description] +- Tags: [Generated tags] +- Categories: [Generated categories] +``` + +## Best Practices + +1. **Rate Limiting** + - Configure appropriate rate limits based on your GitHub API quota + - Use caching to minimize API calls + - Implement proper error handling for rate limit exceeded scenarios + +2. **Content Generation** + - Start with basic content types before generating advanced content + - Review generated content for accuracy and completeness + - Customize prompts for specific repository types + +3. **Caching** + - Set appropriate TTL based on repository update frequency + - Clear cache when repository content changes significantly + - Monitor cache size and performance + +4. **Error Handling** + - Implement proper error handling for API failures + - Log errors for debugging + - Provide fallback mechanisms for failed content generation + +## Dependencies + +- Python 3.8+ +- aiohttp +- beautifulsoup4 +- loguru +- pydantic +- requests +- pandas + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Commit your changes +4. Push to the branch +5. Create a Pull Request + +## License + +[Your License Here] + +## Support + +For support, please [create an issue](https://github.com/your-repo/issues) or contact the maintainers. 
\ No newline at end of file diff --git a/lib/ai_writers/github_blogs/github_getting_started.py b/lib/ai_writers/github_blogs/github_getting_started.py index 17ecc201..81b247b5 100644 --- a/lib/ai_writers/github_blogs/github_getting_started.py +++ b/lib/ai_writers/github_blogs/github_getting_started.py @@ -1,39 +1,254 @@ +""" +Enhanced GitHub Content Generator + +This module provides various content generation capabilities from GitHub repository data, +including getting started guides, technical documentation, tutorials, and more. +""" + import sys - -from .gpt_providers.openai_chat_completion import openai_chatgpt -from .gpt_providers.gemini_pro_text import gemini_text_response - +from typing import Dict, List, Optional from loguru import logger + +from lib.gpt_providers.text_generation.main_text_generation import llm_text_gen + logger.remove() logger.add(sys.stdout, - colorize=True, - format="{level}|{file}:{line}:{function}| {message}" - ) + colorize=True, + format="{level}|{file}:{line}:{function}| {message}") +def generate_technical_documentation(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate comprehensive technical documentation from repository data.""" + prompt = f"""As an expert technical writer, create detailed technical documentation for the following GitHub repository: +Repository Data: +{repo_data} -def github_readme_blog(readme_content): - """ """ - prompt = f"""As an expert programmer and teacher, Write an original, detailed and step-by-step guide, from the provided Text below. - Your guide should be original, engaging and help beginners get started easily. - Write new example codes and detailed comments on how to run them. Include appropriate emoji where applicable. - Include a referances section that links to more code examples. - Your response MUST be a how-to blog in markdown format. - Respond ONLY with your blog content. +Please create a comprehensive technical documentation that includes: +1. Architecture Overview +2. 
Core Components +3. Technical Specifications +4. Integration Points +5. Performance Considerations +6. Security Features +7. API Documentation (if applicable) +8. Configuration Options +9. Deployment Guidelines +10. Troubleshooting Guide - Text: '{readme_content}' - """ - if 'gemini' in gpt_providers: - try: - response = gemini_text_response(prompt) - return response - except Exception as err: - logger.error(f"Failed to get response from gemini: {err}") - sys.exit(1) - elif 'openai' in gpt_providers: - try: - logger.info("Calling OpenAI LLM.") - response = openai_chatgpt(prompt) - return response - except Exception as err: - SystemError(f"Failed to get response from Openai: {err}") +Format the documentation in markdown with appropriate headers, code blocks, and diagrams. +Include real-world examples and best practices. +""" + return _get_llm_response(prompt, gpt_provider) + +def generate_getting_started_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate a beginner-friendly getting started guide.""" + prompt = f"""As an expert programmer and teacher, create a comprehensive getting started guide for the following GitHub repository: + +Repository Data: +{repo_data} + +Create a step-by-step guide that includes: +1. Introduction and Overview +2. Prerequisites and Setup +3. Installation Instructions +4. Basic Usage Examples +5. Common Use Cases +6. Best Practices +7. 
Next Steps and Resources + +Make the guide: +- Beginner-friendly with clear explanations +- Include practical examples with code snippets +- Add emojis for better readability +- Include troubleshooting tips +- Provide links to additional resources +""" + return _get_llm_response(prompt, gpt_provider) + +def generate_tutorial_series(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate a series of tutorials for different skill levels.""" + prompt = f"""As an expert educator, create a series of tutorials for the following GitHub repository: + +Repository Data: +{repo_data} + +Create a structured tutorial series that includes: +1. Beginner Tutorial + - Basic concepts + - Simple examples + - Step-by-step instructions + +2. Intermediate Tutorial + - Advanced features + - Real-world examples + - Best practices + +3. Advanced Tutorial + - Complex use cases + - Performance optimization + - Integration patterns + +Each tutorial should: +- Be self-contained +- Include practical examples +- Have clear learning objectives +- Include exercises and challenges +""" + return _get_llm_response(prompt, gpt_provider) + +def generate_comparison_analysis(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate a comparison analysis with similar tools/frameworks.""" + prompt = f"""As a technical analyst, create a comprehensive comparison analysis for the following GitHub repository: + +Repository Data: +{repo_data} + +Create a detailed comparison that includes: +1. Feature Comparison +2. Performance Analysis +3. Use Case Suitability +4. Community and Support +5. Learning Curve +6. Integration Capabilities +7. 
Future Prospects + +Include: +- Pros and Cons +- Real-world use cases +- Industry adoption +- Community feedback +- Future roadmap +""" + return _get_llm_response(prompt, gpt_provider) + +def generate_case_studies(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate real-world case studies and success stories.""" + prompt = f"""As a technical writer, create compelling case studies for the following GitHub repository: + +Repository Data: +{repo_data} + +Create detailed case studies that include: +1. Problem Statement +2. Solution Implementation +3. Technical Challenges +4. Results and Benefits +5. Lessons Learned +6. Future Improvements + +Make the case studies: +- Based on real-world scenarios +- Include technical details +- Show measurable results +- Provide actionable insights +""" + return _get_llm_response(prompt, gpt_provider) + +def generate_contribution_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate a comprehensive contribution guide.""" + prompt = f"""As an open-source maintainer, create a detailed contribution guide for the following GitHub repository: + +Repository Data: +{repo_data} + +Create a contribution guide that includes: +1. Development Setup +2. Code Style Guidelines +3. Testing Requirements +4. Documentation Standards +5. Pull Request Process +6. Review Guidelines +7. Community Guidelines + +Make the guide: +- Clear and concise +- Include examples +- Cover all contribution types +- Provide templates +""" + return _get_llm_response(prompt, gpt_provider) + +def generate_security_guide(repo_data: Dict, gpt_provider: str = "gemini") -> str: + """Generate a security best practices guide.""" + prompt = f"""As a security expert, create a comprehensive security guide for the following GitHub repository: + +Repository Data: +{repo_data} + +Create a security guide that includes: +1. Security Architecture +2. Authentication & Authorization +3. Data Protection +4. Secure Configuration +5. 
def _get_llm_response(prompt: str, gpt_provider: str) -> str:
    """Send *prompt* to the text-generation backend and return its reply.

    Args:
        prompt: Fully rendered user prompt for the content to generate.
        gpt_provider: Provider label. It is only used in the error log
            message; the call itself goes through ``llm_text_gen`` without it.

    Returns:
        Whatever ``llm_text_gen`` returns for the prompt.

    Raises:
        Exception: Any error raised by ``llm_text_gen`` is logged and
            re-raised unchanged.
    """
    system_prompt = """You are an expert technical writer and GitHub repository analyst with deep expertise in software development, documentation, and technical communication.

    Your role is to create high-quality, accurate, and engaging content based on GitHub repository data. You should:

    1. **Technical Accuracy**
       - Ensure all technical information is precise and up-to-date
       - Verify code examples and configurations
       - Cross-reference documentation and source code
       - Maintain consistency with repository standards

    2. **Content Structure**
       - Use clear hierarchical organization
       - Include appropriate code blocks and examples
       - Add relevant diagrams and visual aids
       - Break complex topics into digestible sections

    3. **Writing Style**
       - Maintain a professional yet approachable tone
       - Use active voice and clear language
       - Include practical examples and use cases
       - Add relevant emojis for better readability

    4. **Best Practices**
       - Follow industry-standard documentation practices
       - Include troubleshooting sections
       - Add performance considerations
       - Address security implications
"""
    try:
        # The response must be returned: every generate_* wrapper forwards
        # this function's result straight to its caller.
        return llm_text_gen(prompt, system_prompt=system_prompt)
    except Exception as err:
        logger.error(f"Failed to get response from {gpt_provider}: {err}")
        raise
+""" import os import sys import datetime import json +from typing import Dict, List, Optional +from pathlib import Path from loguru import logger logger.remove() logger.add(sys.stdout, - colorize=True, - format="{level}|{file}:{line}:{function}| {message}" - ) + colorize=True, + format="{level}|{file}:{line}:{function}| {message}") +from .scrape_github_readme import GitHubScraper, GitHubContent from .scrape_github_readme import get_gh_details_vision, get_readme_content from .scrape_github_readme import research_github_topics, check_if_already_written -from .github_getting_started import github_readme_blog -from .gpt_online_researcher import do_online_research -from .faqs_generator_blog import generate_blog_faq -from .get_blog_metadata import blog_metadata -from .save_blog_to_file import save_blog_to_file -from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file +from .github_getting_started import ( + generate_technical_documentation, + generate_getting_started_guide, + generate_tutorial_series, + generate_comparison_analysis, + generate_case_studies, + generate_contribution_guide, + generate_security_guide, + generate_performance_guide +) - -def blog_from_github(github_opts, flag): - """ Module for writing getting started code examples from github. 
""" - if 'url' in flag: - try: - write_from_url(github_opts) - except Exception as err: - logger.error(f"Failed to write from github url: {github_opts}") - sys.exit(1) - elif 'csv' in flag: - try: - gh_urls = [] - with open(github_opts, 'r', encoding="utf-8") as file: - # Read each line in the file - for gh_url in file: - gh_urls.append(gh_url.strip()) - except FileNotFoundError: - logger.error(f"CSV File not found: {file_path}") - except Exception as e: - logger.error(f"CSV: An error occurred: {str(e)}") - - for gh_url in gh_urls: - try: - write_from_url(gh_url.strip()) - except Exception as err: - logger.error(f"Failed to write blog from github: {err}") - - - -def write_from_url(gh_url): - # String to store the blog content. - howto_blog = '' - # The url was not found in already_written data. - if not check_if_already_written(gh_url): - logger.info(f"Writing getting started from url: {gh_url}") - else: - logger.error(f"Skipping, already written on url: {gh_url}") - return - - # Direct link to the raw content of README file - # fixme: Remove the hardcoding, need add another option OR in config ? 
class GitHubBlogGenerator:
    """Generator for various types of GitHub-related content.

    Scrapes a repository through ``GitHubScraper`` and feeds the scraped data
    to the ``generate_*`` prompt helpers, one per requested content type.
    """

    def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24):
        """Initialize the blog generator.

        Args:
            cache_dir: Cache directory passed through to ``GitHubScraper``.
            ttl_hours: Cache lifetime in hours passed through to ``GitHubScraper``.
        """
        self.cache_dir = Path(cache_dir)
        self.scraper = GitHubScraper(cache_dir, ttl_hours)
        self.output_dir = Path("generated_content")
        self.output_dir.mkdir(exist_ok=True)

    async def generate_content(
        self, github_url: str, content_types: Optional[List[str]] = None
    ) -> Dict[str, str]:
        """Generate various types of content from a GitHub repository.

        Args:
            github_url: URL of the repository to scrape.
            content_types: Content type keys to generate. Defaults to
                ``["getting_started", "technical_docs", "tutorials"]``.
                Unknown keys are logged and skipped.

        Returns:
            Mapping of content type to generated text, plus a ``"faqs"``
            entry when FAQ generation succeeds.

        Raises:
            Exception: Scraping or generation errors are logged and re-raised.
        """
        if content_types is None:
            content_types = ["getting_started", "technical_docs", "tutorials"]

        # Built per call so the class can be defined before the generator
        # helpers are resolved.
        generators = {
            "getting_started": generate_getting_started_guide,
            "technical_docs": generate_technical_documentation,
            "tutorials": generate_tutorial_series,
            "comparison": generate_comparison_analysis,
            "case_studies": generate_case_studies,
            "contribution": generate_contribution_guide,
            "security": generate_security_guide,
            "performance": generate_performance_guide,
        }

        try:
            # GitHubScraper creates its aiohttp session only in __aenter__, and
            # its fetch helpers call self.session.get() directly, so the scrape
            # has to run inside the scraper's async context.
            # NOTE(review): assumes scrape_github_content does not open the
            # session itself - confirm against its implementation.
            async with self.scraper:
                repo_content = await self.scraper.scrape_github_content(github_url)

            # Serialize once; every generator receives the same repository data.
            repo_data = repo_content.dict()

            generated_content = {}
            for content_type in content_types:
                generate = generators.get(content_type)
                if generate is None:
                    logger.warning(f"Unknown content type: {content_type}")
                    continue
                generated_content[content_type] = generate(repo_data)

            # Generate FAQs from online research (best effort).
            # NOTE(review): do_online_research and generate_blog_faq are not
            # imported by this module any more, so this step currently fails
            # with NameError and is only logged - restore the imports.
            try:
                research_report = do_online_research(repo_content.title, "gemini", github_url)
                faqs = generate_blog_faq(research_report, "gemini")
                generated_content["faqs"] = faqs
            except Exception as err:
                logger.error(f"Failed to generate FAQs: {err}")

            return generated_content

        except Exception as err:
            logger.error(f"Failed to generate content: {err}")
            raise

    def save_content(self, content: Dict[str, str], base_filename: str):
        """Save generated content to files.

        Args:
            content: Mapping of content type to generated text.
            base_filename: Prefix used to build the per-type file name that
                is reported in the log.

        Raises:
            Exception: Any metadata or save error is logged and re-raised.
        """
        # NOTE(review): blog_metadata and save_blog_to_file are not imported by
        # this module any more - restore the imports before calling this.
        try:
            for content_type, content_text in content.items():
                # Generate metadata for each content type
                title, meta_desc, tags, categories = blog_metadata(content_text, "gemini")

                # NOTE(review): this name is only logged; it is never handed to
                # save_blog_to_file, which presumably picks its own path - verify.
                filename = f"{base_filename}_{content_type}.md"

                save_blog_to_file(
                    content_text,
                    title,
                    meta_desc,
                    tags,
                    categories,
                    None  # No image path for now
                )

                logger.info(f"Saved {content_type} content to {filename}")

        except Exception as err:
            logger.error(f"Failed to save content: {err}")
            raise
def _base_filename_from_url(url: str) -> str:
    """Return the last path segment of *url*, ignoring trailing slashes."""
    return url.rstrip("/").split("/")[-1]


async def main():
    """Example usage of the GitHub blog generator.

    Generates every supported content type for each example URL and saves
    the results. A failure on one URL is logged and the loop moves on.
    """
    generator = GitHubBlogGenerator()

    # Example GitHub URLs
    urls = [
        "https://github.com/owner/repo",
        "https://github.com/owner/another-repo",
    ]

    content_types = [
        "getting_started",
        "technical_docs",
        "tutorials",
        "comparison",
        "case_studies",
        "contribution",
        "security",
        "performance",
    ]

    for url in urls:
        try:
            content = await generator.generate_content(url, content_types)

            # A trailing slash would otherwise yield an empty base filename.
            base_filename = _base_filename_from_url(url)

            generator.save_content(content, base_filename)

        except Exception as e:
            # Best effort: keep processing the remaining URLs.
            logger.error(f"Error processing {url}: {e}")


if __name__ == "__main__":
    # asyncio is not among this module's top-level imports; import it here so
    # running the file as a script does not fail with NameError.
    import asyncio

    asyncio.run(main())
+""" + import os import sys -import datetime -import pandas as pd - import json -import requests +import asyncio +import aiohttp +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Union +from urllib.parse import urljoin, urlparse +import pandas as pd from bs4 import BeautifulSoup from loguru import logger +import requests +from pydantic import BaseModel, Field +import time +import pickle +from pathlib import Path + +# Configure logging logger.remove() logger.add(sys.stdout, colorize=True, - format="{level}|{file}:{line}:{function}| {message}" - ) + format="{level}|{file}:{line}:{function}| {message}") - -from .take_url_screenshot import take_screenshot -from .gpt_providers.gemini_image_details import gemini_get_img_info - - - -def get_readme_content(url): - try: - # Fetch the README content directly from the URL - response = requests.get(url) - print(response.status_code) - if response.status_code == 200: - logger.debug("Successfully fetched the README.md") - readme_content = response.text - else: - readme_content = None - return readme_content - except Exception as err: - logger.error(f"Failed to fetch raw readme from {url}: {err}: {response.status_code}") - sys.exit(1) - - -def get_gh_repo_metadata(github_url): - """ Function to get the repo details like stars, commits, forks etc """ - logger.info("Scraping github with BS4 and requests.") - # download the target page - page = requests.get(github_url) - # parse the HTML document returned by the server - soup = BeautifulSoup(page.text, 'html.parser') - - # initialize the object that will contain the scraped data - repo = {} - - # repo scraping logic - name_html_element = soup.select_one('[itemprop="name"]') - name = name_html_element.get_text().strip() - - git_branch_icon_html_element = soup.select_one('.octicon-git-branch') - main_branch_html_element = git_branch_icon_html_element.find_next_sibling('span') - main_branch = main_branch_html_element.get_text().strip() - - # scrape the 
repo history data - boxheader_html_element = soup.select_one('.Box .Box-header') - - # scrape the repo details in the right box - bordergrid_html_element = soup.select_one('.BorderGrid') - - about_html_element = bordergrid_html_element.select_one('h2') - description_html_element = about_html_element.find_next_sibling('p') - description = description_html_element.get_text().strip() - - star_icon_html_element = bordergrid_html_element.select_one('.octicon-star') - stars_html_element = star_icon_html_element.find_next_sibling('strong') - stars = stars_html_element.get_text().strip().replace(',', '') - - eye_icon_html_element = bordergrid_html_element.select_one('.octicon-eye') - watchers_html_element = eye_icon_html_element.find_next_sibling('strong') - watchers = watchers_html_element.get_text().strip().replace(',', '') - - fork_icon_html_element = bordergrid_html_element.select_one('.octicon-repo-forked') - forks_html_element = fork_icon_html_element.find_next_sibling('strong') - forks = forks_html_element.get_text().strip().replace(',', '') - - # Find the div with class "f6" containing topic links - topic_div = soup.find('div', class_='f6') - if topic_div: - # Find all the topic links within the div - topic_links = topic_div.find_all('a', class_='topic-tag-link') - # Extract and print the topics - repo['topics'] = [link.text.strip() for link in topic_links] - - # FIXME: Unable to scrape branch name. - repo['branch_name'] = None - # store the scraped data - repo['name'] = name - repo['about'] = description - repo['stars'] = stars - repo['watchers'] = watchers - repo['forks'] = forks - #repo['readme'] = readme - logger.info(f"Github Repo Details: {repo}") - return(repo) - - -def get_gh_details_vision(github_url, generated_image_filepath): - """ Take a screenshot of the url and feed to vision models for scraping details. 
""" - logger.info(f"Take screenshot and pass it to gemini for repo details of {github_url}") - - generated_image_filepath = take_screenshot(github_url, generated_image_filepath) - prompt = """From the given image of a github page, find out the number of stars, about, forks, last commit days, link url, topics and branch name. Return the result as json.""" +class RateLimiter: + """Rate limiter for GitHub API requests.""" - try: - gh_details = gemini_get_img_info(prompt, generated_image_filepath) - logger.info(f"Github Repo details, from vision model: {gh_details}") - #gh_details = get_gh_repo_metadata(github_url) - except Exception as err: - logger.error(f"Failed to get gh images details: {err}") - gh_details = get_gh_repo_metadata(github_url) - return gh_details - - # Convert string to dictionary Split the string into lines - lines = gh_details.split('\n') - # Remove the first and last line - modified_lines = lines[1:-1] - # Join the modified lines back into a string - gh_details = '\n'.join(modified_lines) - gh_details = json.loads(gh_details) - - return(gh_details) - - -def research_github_topics(topics): - """ Scrape github topics of interest for top repos to write on """ - # https://www.kaggle.com/code/subhaskumarray/scraping-github-topics-with-their-repositories - # We are going to scrape https://github.com/topics - # We will get a list of topics. For each topic, we will extract topic name, topic description and topic url. - # For each topic, we will get top 30 repositories with repo name, repo username, stars and repo url. - # Finally we are going to create csv file for each topic with respective repo details. 
- - #github_topics = "https://github.com/topics/" - #response = requests.get(github_topics) - #if response.status_code != 200: - # logger.error(f'There is something wrong with {url}') - #response_contents = response.text - # Now we will parse the contents using BeautifulSoup: - #parsed_contents = BeautifulSoup(response_contents,'html.parser') - #logger.info("Get all topics, Titles and their urls from github.") - #topic_titles = get_topic_titles(parsed_contents) - #topic_desc = get_topic_desc(parsed_contents) - #topic_urls = get_topic_url(parsed_contents) - #topic_df = pd.DataFrame(list(zip(topic_titles, topic_desc,topic_urls)),\ - # columns =['title', 'description', 'url']) - #logger.info(f"Scraped data from github: {topic_df}") - - gh_topics = ['ai', 'ai-tools', 'ai-assistant', 'ai-agents-framework', 'llm', 'multi-agent', 'fine-tuning', 'rag', 'generative', 'prompt-engineering', 'generative-ai', 'text-to-image-generation', 'llm-ops', 'retrieval-augmented-generation', 'langchain', 'gemini-api', 'vertex-ai', 'huggingface', 'auto-gpt', 'llmops', 'ai-toolkit', 'chatbot', 'chatgpt', 'code-assistant', 'text-to-video', 'llms', 'gpt-4'] - - repo_info_dict = { - 'username':[], - 'repo_name': [], - 'stars': [], - 'repo_url': [] - } - for agh_topic in gh_topics: - topic_url = f"https://github.com/topics/{agh_topic}" - first_topic_repo_page = download_repo_page(topic_url) - logger.info(f"Get details on github topic: {topic_url}") - repo_tags = first_topic_repo_page.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'}) - star_tags = first_topic_repo_page.find_all('span', {'class': 'Counter js-social-count'}) + def __init__(self, calls_per_minute: int = 30): + self.calls_per_minute = calls_per_minute + self.interval = 60 / calls_per_minute # seconds between calls + self.last_call_time = 0 + self.lock = asyncio.Lock() - for i in range(len(repo_tags)): - repo_details = get_repo_info(repo_tags[i], star_tags[i]) + async def acquire(self): + """Acquire rate limit 
token.""" + async with self.lock: + current_time = time.time() + time_since_last_call = current_time - self.last_call_time + + if time_since_last_call < self.interval: + await asyncio.sleep(self.interval - time_since_last_call) + + self.last_call_time = time.time() + +class Cache: + """Cache for GitHub content.""" + + def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24): + self.cache_dir = Path(cache_dir) + self.ttl = timedelta(hours=ttl_hours) + self.cache_dir.mkdir(exist_ok=True) + + def _get_cache_path(self, key: str) -> Path: + """Get cache file path for a key.""" + return self.cache_dir / f"{hash(key)}.cache" + + def get(self, key: str) -> Optional[Dict]: + """Get cached value for key.""" + cache_path = self._get_cache_path(key) - # Check if the repo URL is not already present in the dictionary - if repo_details[3] not in repo_info_dict['repo_url']: - # Store repos with more than 5000 stars. - if repo_details[2] > 5000: - repo_info_dict['username'].append(repo_details[0]) - repo_info_dict['repo_name'].append(repo_details[1]) - repo_info_dict['stars'].append(repo_details[2]) - repo_info_dict['repo_url'].append(repo_details[3]) - - # Create a DataFrame from repo_info_dict - df_repo_info = pd.DataFrame(repo_info_dict['repo_url']) - - # Check if the file already exists - csv_filename = 'github_url_to_write.csv' - if os.path.isfile(csv_filename): - # Append to the existing file - df_repo_info.to_csv(csv_filename, mode='a', header=False, index=False) - logger.info(f"Data appended to existing file: {csv_filename}") - else: - # Create a new file - df_repo_info.to_csv(csv_filename, index=False) - - -def get_topic_titles(parsed_content): - try: - selected_class = 'f3 lh-condensed mb-0 mt-1 Link--primary' - topic_title_tags = parsed_content.find_all('p',{'class':selected_class}) - # We can make a list of topics - topic_titles = [] - for tags in topic_title_tags: - topic_titles.append(tags.text) - return topic_titles - except Exception as err: - 
logger.error(f"Failed to get github topic titles: {err}") - - -def get_topic_desc(parsed_contents): - try: - desc_selector = 'f5 color-fg-muted mb-0 mt-1' - topic_desc_tags = parsed_contents.find_all('p',{'class': desc_selector}) - print(f"{topic_desc_tags}") - topic_desc = [] - for desc in topic_desc_tags: - print("dsfsfs") - topic_desc.append(desc.text.strip()) # strip() is used for trimming all extra spaces in description. - return topic_desc - except Exception as err: - logger.error(f"Failed to get github topic desc: {err}") - - -def get_topic_url(parsed_contents): - try: - topic_link_tag = parsed_contents.find_all('a',{'class':'no-underline flex-1 d-flex flex-column'}) - topic_urls = [] - base_url = 'http://github.com' - for urls in topic_link_tag: - topic_urls.append(base_url + urls['href']) - return topic_urls - except Exception as err: - logger.error(f"Failed to get github topic urls: {err}") - - -def download_repo_page(topic_url): - response = requests.get(topic_url) - if response.status_code != 200: - print('There is some error in {}'.format(topic_url)) - response_contents = response.text + if not cache_path.exists(): + return None + + try: + with open(cache_path, 'rb') as f: + data = pickle.load(f) + if datetime.now() - data['timestamp'] > self.ttl: + cache_path.unlink() + return None + return data['value'] + except Exception as e: + logger.warning(f"Cache read error for {key}: {e}") + return None - parsed_contents = BeautifulSoup(response_contents,'html.parser') - return parsed_contents + def set(self, key: str, value: Dict): + """Set cache value for key.""" + cache_path = self._get_cache_path(key) + + try: + with open(cache_path, 'wb') as f: + pickle.dump({ + 'timestamp': datetime.now(), + 'value': value + }, f) + except Exception as e: + logger.warning(f"Cache write error for {key}: {e}") +class GitHubContent(BaseModel): + """Model for GitHub content analysis.""" + title: str = Field("", description="Title of the content") + description: str = 
Field("", description="Description of the content") + content: str = Field("", description="Main content") + language: str = Field("", description="Programming language") + stars: int = Field(0, description="Number of stars") + forks: int = Field(0, description="Number of forks") + watchers: int = Field(0, description="Number of watchers") + last_updated: str = Field("", description="Last update date") + topics: List[str] = Field([], description="Repository topics") + contributors: List[str] = Field([], description="Contributor usernames") + readme_url: str = Field("", description="URL of the README") + raw_content_url: str = Field("", description="URL for raw content") + license: str = Field("", description="Repository license") + dependencies: List[str] = Field([], description="Project dependencies") + metadata: Dict = Field({}, description="Additional metadata") -def get_repo_info(repo_tags,star_tags): - # returns all info for a repo - a_tags = repo_tags.find_all('a') - username = a_tags[0].text.strip() - repo_name = a_tags[1].text.strip() - base_url = 'http://github.com/' - repo_url = base_url + a_tags[1]['href'].strip() +class GitHubScraper: + """Service for scraping GitHub content with rate limiting and caching.""" - # Defining a function so that it will convert our star count to integer - def star_counts_converter(stars): - stars = stars.strip() - if stars[-1] == 'k': - return int(float(stars[:-1]) * 1000) - return int(stars) - star_counts = star_counts_converter(star_tags.text.strip()) - return username,repo_name,star_counts,repo_url + def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24, calls_per_minute: int = 30): + """Initialize the scraper service.""" + self.session = None + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Accept': 'application/vnd.github.v3+json' + } + self.rate_limiter = RateLimiter(calls_per_minute) + 
self.cache = Cache(cache_dir, ttl_hours) + + async def __aenter__(self): + """Create aiohttp session when entering context.""" + self.session = aiohttp.ClientSession(headers=self.headers) + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Close aiohttp session when exiting context.""" + if self.session: + await self.session.close() + + async def fetch_url(self, url: str, use_cache: bool = True) -> str: + """Fetch URL content asynchronously with rate limiting and caching.""" + if use_cache: + cached_content = self.cache.get(url) + if cached_content: + logger.debug(f"Cache hit for {url}") + return cached_content + + await self.rate_limiter.acquire() + + try: + async with self.session.get(url) as response: + if response.status == 200: + content = await response.text() + if use_cache: + self.cache.set(url, content) + return content + else: + error_msg = f"Failed to fetch URL: Status code {response.status}" + logger.error(error_msg) + raise Exception(error_msg) + except Exception as e: + logger.error(f"Error fetching URL {url}: {e}") + raise + + def parse_github_url(self, url: str) -> Dict[str, str]: + """Parse GitHub URL to extract repository information.""" + parsed = urlparse(url) + path_parts = parsed.path.strip('/').split('/') + + if len(path_parts) < 2: + raise ValueError("Invalid GitHub URL format") + + return { + 'owner': path_parts[0], + 'repo': path_parts[1], + 'branch': path_parts[3] if len(path_parts) > 3 else 'main', + 'path': '/'.join(path_parts[4:]) if len(path_parts) > 4 else '' + } + + async def get_repo_metadata(self, owner: str, repo: str) -> Dict: + """Get repository metadata from GitHub API with caching.""" + cache_key = f"metadata_{owner}_{repo}" + cached_metadata = self.cache.get(cache_key) + if cached_metadata: + return cached_metadata + + await self.rate_limiter.acquire() + + api_url = f"https://api.github.com/repos/{owner}/{repo}" + try: + async with self.session.get(api_url) as response: + if response.status == 200: + 
async def get_readme_content(self, owner: str, repo: str, branch: str = 'main') -> Dict:
    """Get README content from GitHub with caching.

    The GitHub REST API is asked for the README on ``branch`` first. If
    that request does not succeed, it is repeated without a ref, which
    makes the API answer for the repository's default branch. Only when
    both API attempts fail is the rendered ``README.md`` page scraped.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        branch: Branch, tag or commit whose README is wanted.

    Returns:
        Dict with the keys ``content``, ``encoding`` and ``url``. API
        results carry the payload exactly as the API sent it, with
        ``encoding`` defaulting to ``'base64'`` when the API omits it.
        Scraped results are plain text with ``encoding == 'text'``.
        An empty dict is returned when nothing could be fetched; errors are
        logged, not raised.
    """
    cache_key = f"readme_{owner}_{repo}_{branch}"
    cached_content = self.cache.get(cache_key)
    if cached_content:
        return cached_content

    try:
        api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
        # The cache key is per branch, so the request has to be as well;
        # otherwise the default-branch README is stored under another
        # branch's key. The trailing ``None`` (no ref) keeps the previous
        # default-branch answer as a fallback, e.g. when ``branch`` is just
        # the 'main' placeholder and the repository has no such branch.
        ref_attempts = [{'ref': branch}, None] if branch else [None]
        for params in ref_attempts:
            await self.rate_limiter.acquire()
            async with self.session.get(api_url, params=params) as response:
                if response.status == 200:
                    readme_data = await response.json()
                    content = {
                        'content': readme_data.get('content', ''),
                        'encoding': readme_data.get('encoding', 'base64'),
                        'url': readme_data.get('html_url', '')
                    }
                    self.cache.set(cache_key, content)
                    return content

        # Fallback to scraping if the API could not provide the README.
        readme_url = f"https://github.com/{owner}/{repo}/blob/{branch}/README.md"
        html_content = await self.fetch_url(readme_url, use_cache=True)
        soup = BeautifulSoup(html_content, 'html.parser')

        # The rendered README is looked up by its "markdown-body" container.
        readme_content = soup.find('div', {'class': 'markdown-body'})
        if readme_content:
            content = {
                'content': readme_content.get_text(),
                'encoding': 'text',
                'url': readme_url
            }
            self.cache.set(cache_key, content)
            return content

        return {}
    except Exception as e:
        # Best-effort getter: callers receive {} instead of an exception.
        logger.error(f"Error fetching README: {e}")
        return {}
async def get_repo_topics(self, owner: str, repo: str) -> List[str]:
    """Return the repository's topic names, consulting the cache first.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.

    Returns:
        The ``names`` list from the GitHub topics endpoint. An empty list is
        returned on a non-200 response or on any error (errors are logged,
        not raised).
    """
    key = f"topics_{owner}_{repo}"
    hit = self.cache.get(key)
    if hit:
        return hit

    try:
        await self.rate_limiter.acquire()
        endpoint = f"https://api.github.com/repos/{owner}/{repo}/topics"
        accept_header = {'Accept': 'application/vnd.github.mercy-preview+json'}
        async with self.session.get(endpoint, headers=accept_header) as resp:
            # Guard clause: anything but 200 yields "no topics", uncached.
            if resp.status != 200:
                return []
            payload = await resp.json()
            names = payload.get('names', [])
            self.cache.set(key, names)
            return names
    except Exception as e:
        logger.error(f"Error fetching topics: {e}")
        return []
async def scrape_github_content(self, url: str) -> GitHubContent:
    """Scrape a GitHub URL into a ``GitHubContent`` record, with caching.

    Parses ``url``, then collects repository metadata, the README (when the
    URL names no file or names ``README.md``) or the named file, the
    repository topics and the contributor logins. The assembled record is
    cached under the URL.

    Args:
        url: Repository, README or file URL on GitHub.

    Returns:
        The populated ``GitHubContent``. The individual getters return empty
        values on failure, so a record may be partially empty rather than
        missing.

    Raises:
        Exception: Any error raised while parsing or assembling (for example
            ``ValueError`` from ``parse_github_url``) is logged and re-raised.
    """
    cache_key = f"content_{url}"
    cached_content = self.cache.get(cache_key)
    if cached_content:
        return GitHubContent(**cached_content)

    try:
        # Parse the GitHub URL
        repo_info = self.parse_github_url(url)
        owner, repo = repo_info['owner'], repo_info['repo']

        # Get repository metadata
        metadata = await self.get_repo_metadata(owner, repo)

        # Get content based on URL type
        if not repo_info['path'] or repo_info['path'].lower() == 'readme.md':
            content_data = await self.get_readme_content(
                owner,
                repo,
                repo_info['branch']
            )
        else:
            content_data = await self.get_file_content(
                owner,
                repo,
                repo_info['path'],
                repo_info['branch']
            )

        # Get additional metadata
        topics = await self.get_repo_topics(owner, repo)
        contributors = await self.get_contributors(owner, repo)

        # The API path hands back base64 text while the scraping fallback
        # hands back plain text. ``GitHubContent`` receives no encoding
        # here, so normalise to plain text before storing it.
        body_text = content_data.get('content', '') or ''
        if body_text and content_data.get('encoding') == 'base64':
            import base64  # local import: only needed on this path
            try:
                body_text = base64.b64decode(body_text).decode('utf-8', errors='replace')
            except ValueError as decode_err:
                # binascii.Error is a ValueError; keep the raw payload
                # rather than failing the whole scrape on a bad blob.
                logger.warning(f"Could not decode base64 content for {url}: {decode_err}")

        # The API sends ``"license": null`` for unlicensed repositories, in
        # which case ``.get('license', {})`` returns None, not the default.
        license_info = metadata.get('license') or {}

        # Create GitHubContent object
        content = GitHubContent(
            title=metadata.get('name', ''),
            # ``description`` and ``language`` may be present but null; fall
            # back to the same '' the missing-key default intends.
            description=metadata.get('description') or '',
            content=body_text,
            language=metadata.get('language') or '',
            stars=metadata.get('stargazers_count', 0),
            forks=metadata.get('forks_count', 0),
            watchers=metadata.get('watchers_count', 0),
            last_updated=metadata.get('updated_at', ''),
            topics=topics,
            contributors=contributors,
            readme_url=content_data.get('url', ''),
            raw_content_url=metadata.get('html_url', ''),
            license=license_info.get('name', ''),
            metadata={
                'size': metadata.get('size', 0),
                'open_issues': metadata.get('open_issues_count', 0),
                'default_branch': metadata.get('default_branch', 'main'),
                'created_at': metadata.get('created_at', ''),
                'pushed_at': metadata.get('pushed_at', '')
            }
        )

        # Cache the complete content
        self.cache.set(cache_key, content.dict())

        return content

    except Exception as e:
        logger.error(f"Error scraping GitHub content: {e}")
        raise
- """ - try: - with open(file_path, 'r', encoding="utf-8") as file: - # Read each line in the file - for line in file: - # Check for an exact match - if github_url.strip() == line.strip(): - return True - except FileNotFoundError: - print(f"File not found: {file_path}") - except Exception as e: - print(f"An error occurred: {str(e)}") - return False +if __name__ == "__main__": + asyncio.run(main()) diff --git a/lib/blog_sections/faqs_generator_blog.py b/lib/blog_sections/faqs_generator_blog.py deleted file mode 100644 index a0c7b87d..00000000 --- a/lib/blog_sections/faqs_generator_blog.py +++ /dev/null @@ -1,37 +0,0 @@ -import sys - -from .gpt_providers.openai_chat_completion import openai_chatgpt -from .gpt_providers.gemini_pro_text import gemini_text_response - -from loguru import logger -logger.remove() -logger.add(sys.stdout, - colorize=True, - format="{level}|{file}:{line}:{function}| {message}" - ) - - -def generate_blog_faq(blog_article, gpt_providers="openai"): - """ - Given a blog title generate an outline for it - """ - logger.info("Generating blog FAQs.") - prompt = f"""As an expert writer, I will provide you with blog content below. - Your task is to write 5 FAQs based on the given blog content. - Always, write fact based answers. Use emojis where applicable. - You must reply in MARKDOWN format. - blog content: '{blog_article}' """ - - if 'gemini' in gpt_providers: - try: - response = gemini_text_response(prompt) - return response - except Exception as err: - logger.error(f"Failed to get response from gemini: {err}") - elif 'openai' in gpt_providers: - try: - logger.info("Calling OpenAI LLM.") - response = openai_chatgpt(prompt) - return response - except Exception as err: - SystemError(f"Failed to get response from Openai: {err}")