AI Blogen - WIP - 0.0.00000.001

2024-01-13 16:27:28 +05:30
parent b51e9a8c2f
commit fd7053fb4b
17 changed files with 1003 additions and 101 deletions
--- a/lib/arxiv_schlorly_research.py
+++ b/lib/arxiv_schlorly_research.py
@@ -0,0 +1,323 @@
+####################################################
+#
+# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
+# https://github.com/danielnsilva/semanticscholar
+# https://github.com/shauryr/S2QA
+#
+####################################################
+
+
+import os 
+import sys
+import re
+
+import pandas as pd
+import arxiv
+import PyPDF2
+import io
+import requests
+from bs4 import BeautifulSoup
+import urllib.parse
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+
+def fetch_arxiv_data(query, max_results=10):
+    try:
+        # Construct the default API client
+        client = arxiv.Client()
+
+        # Search for articles matching the keyword
+        search = arxiv.Search(
+            query=query,
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.SubmittedDate
+        )
+
+        # Fetching results
+        results = list(client.results(search))
+        # Extracting data
+        all_data = []
+        for result in results:
+            temp = [result.title, result.published, result.entry_id, result.summary, result.pdf_url]
+            all_data.append(temp)
+
+        return all_data
+
+    except Exception as e:
+        print("An error occurred while fetching data from arXiv:", e)
+        raise e
+
+
+def create_dataframe(data, column_names):
+    try:
+        df = pd.DataFrame(data, columns=column_names)
+        return df
+    except Exception as e:
+        print("An error occurred while creating DataFrame:", e)
+        return pd.DataFrame()
+
+
+def get_arxiv_main_content(url):
+    """
+    Returns the main content of an arXiv paper.
+
+    Args:
+        url (str): The URL of the arXiv paper.
+
+    Returns:
+        str: The main content of the paper, or a short failure message string when the content cannot be found or retrieved.
+    """
+
+    try:
+        # Send a GET request to the URL
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Find the main content in 'ltx_page_content'
+        main_content = soup.find('div', class_='ltx_page_content')
+        if not main_content:
+            logger.warning("Main content not found in the page.")
+            return "Main content not found."
+
+        # Remove specific section with class 'package-alerts ltx_document'
+        alert_section = main_content.find('div', class_='package-alerts ltx_document')
+        if alert_section:
+            alert_section.decompose()
+
+        # Remove the abstract and authors elements if present
+        for element_id in ["abs", "authors"]:
+            element = main_content.find(id=element_id)
+            if element:
+                element.decompose()
+        return main_content.text.strip()
+
+    # Fallback: the arXiv HTML page could not be fetched or parsed, so download the PDF and read its text instead.
+    except Exception as html_error:
+        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
+        try:
+            # Extract arXiv ID from URL
+            arxiv_id = url.split('/')[-1]
+            # Fetch paper information using arXiv API
+            paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
+            pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
+            # Initialize an empty string to store the extracted text
+            pdf_text = ''
+
+            # Read the downloaded PDF
+            with open(pdf_filename, 'rb') as f:
+                pdf_reader = PyPDF2.PdfReader(f)
+
+                for page in pdf_reader.pages:
+                    try:
+                        # Attempt to extract text from the current page
+                        page_text = page.extract_text()
+                        # If text extraction is successful, add it to the cumulative text
+                        if page_text:
+                            pdf_text += page_text + '\n'
+                    except UnicodeDecodeError as err:
+                        # FIXME: Handle any UnicodeDecodeError that arises during text extraction
+                        logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
+                        pass
+
+            # Remove the downloaded PDF file now that its text has been read
+            os.remove(pdf_filename)
+            
+            # Pattern to match 'References' and everything that follows
+            pattern = r'References\s*.*'
+            pdf_text = re.sub(pattern, '', pdf_text, flags=re.IGNORECASE | re.DOTALL)
+            sections_to_remove = ['Acknowledgements', 'References', 'Bibliography']
+            for section in sections_to_remove:
+                # Pattern to match the section title and any text following it until the next big title or end of document
+                pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
+                pdf_text = re.sub(pattern, '', pdf_text, flags=re.DOTALL | re.IGNORECASE)
+
+            return pdf_text
+
+        except Exception as pdf_error:
+            logger.error(f"Failed to process PDF: {pdf_error}")
+            return "Failed to retrieve content."
+
+
+def download_image(image_url, base_url, folder="images"):
+    # Skip downloading if the image URL is a data URI
+    if image_url.startswith('data:image'):
+        print(f"Skipping download of data URI image: {image_url}")
+        return False
+
+    # Create the folder if it doesn't exist
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+
+    # Form the absolute URL for image paths
+    if not urllib.parse.urlparse(image_url).scheme:
+        if not base_url.endswith('/'):
+            base_url += '/'
+        image_url = base_url + image_url
+
+    # Download and save the image
+    try:
+        response = requests.get(image_url)
+        response.raise_for_status()
+
+        image_name = image_url.split("/")[-1]
+        with open(os.path.join(folder, image_name), 'wb') as file:
+            file.write(response.content)
+        return True
+
+    except requests.RequestException as e:
+        print(f"Error downloading {image_url}: {str(e)}")
+        return False
+
+
+def scrape_images_from_arxiv(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        images = soup.find_all('img')
+
+        image_urls = [img['src'] for img in images if 'src' in img.attrs]
+        return image_urls
+
+    except requests.RequestException as e:
+        print(f"Error fetching page {url}: {str(e)}")
+        return []
+
+
+def arxiv_bibtex(arxiv_id):
+    """
+    Get the BibTeX entry for an arXiv paper.
+    Args: 
+        arxiv_id: The arXiv ID of the paper.
+    Returns: 
+        A string containing the BibTeX entry.
+    """
+
+    import urllib.request, xml.dom.minidom
+
+    # Download the XML
+    try:
+        usock = urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}')
+        xmldoc = xml.dom.minidom.parse(usock)
+        usock.close()
+    except Exception as e:
+        raise e
+
+    # Parse the XML
+    entry = xmldoc.getElementsByTagName("entry")[0]
+    date = entry.getElementsByTagName("updated")[0].firstChild.data
+    text_year = date[:4]
+
+    title = entry.getElementsByTagName("title")[0]
+    text_title = title.firstChild.data.strip()
+
+    authorlist = []
+    first = True
+    for person_name in entry.getElementsByTagName("author"):
+        # Get names
+        name = person_name.getElementsByTagName("name")[0]
+        text_name = name.firstChild.data
+        text_given_name = ' '.join(text_name.split()[:-1])
+        text_surname = text_name.split()[-1]
+        authorlist.append(f"{text_surname}, {text_given_name}")
+        # First author?
+        if first:
+            text_first_author_surname = text_surname
+            first = False
+
+    # Construct the BibTeX entry
+    bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
+    bibtex += f" author = {' and '.join(authorlist)},\n"
+    bibtex += f" title = {{{text_title}}},\n"
+    bibtex += f" year = {{{text_year}}},\n"
+    bibtex += f" eprint = {{{arxiv_id}}},\n"
+    bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
+    bibtex += "}"
+
+    return bibtex
+
+
+#from serpapi import GoogleSearch
+#params = {
+#  "api_key": "os.getenv(SERPER_API_KEY)",
+#  "engine": "google_scholar",
+#  "q": "llm",
+#  "hl": "en",
+#  "as_ylo": "2023",
+#  "as_yhi": "2024"
+#}
+#search = GoogleSearch(params)
+#results = search.get_dict()
+
+
+def extract_arxiv_ids_from_line(line):
+    """
+    Extract the arXiv ID from a given line of text.
+
+    Args:
+    line (str): A line of text potentially containing an arXiv URL.
+
+    Returns:
+    str: The extracted arXiv ID, including a version suffix such as 'v2' when present, or None if not found.
+    """
+    arxiv_id_pattern = re.compile(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?')
+    match = arxiv_id_pattern.search(line)
+    if match:
+        return match.group(1) + (match.group(2) if match.group(2) else '')
+    return None
+
+
+def read_written_ids(file_path):
+    """
+    Read already written arXiv IDs from a file.
+
+    Args:
+    file_path (str): Path to the file containing written IDs.
+
+    Returns:
+    set: A set of arXiv IDs.
+    """
+    written_ids = set()
+    try:
+        with open(file_path, 'r') as file:
+            for line in file:
+                written_ids.add(line.strip())
+    except FileNotFoundError:
+        logger.error(f"File not found: {file_path}")
+    except Exception as e:
+        logger.error(f"Error while reading the file: {e}")
+    return written_ids
+
+
+def append_id_to_file(arxiv_id, output_file_path):
+    """
+    Append a single arXiv ID to a file. Checks if the file exists and creates it if not.
+
+    Args:
+    arxiv_id (str): The arXiv ID to append.
+    output_file_path (str): Path to the output file.
+    """
+    try:
+        # Check if file exists
+        if not os.path.exists(output_file_path):
+            logger.info(f"File does not exist. Creating new file: {output_file_path}")
+            # Create a new file and append the ID
+            with open(output_file_path, 'a') as outfile:
+                outfile.write(arxiv_id + '\n')
+        else:
+            logger.info(f"Appending to existing file: {output_file_path}")
+            # File exists, append the ID
+            with open(output_file_path, 'a') as outfile:
+                outfile.write(arxiv_id + '\n')
+
+    except Exception as e:
+        logger.error(f"Error while appending to file: {e}")
--- a/lib/combine_research_and_blog.py
+++ b/lib/combine_research_and_blog.py
@@ -1,13 +1,23 @@
+import sys
+
 from .gpt_providers.openai_chat_completion import openai_chatgpt
+from .gpt_providers.gemini_pro_text import gemini_text_response
+
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )


-def blog_with_research(report, blog):
+def blog_with_research(report, blog, gpt_providers="openai"):
    """Combine the given online research and gpt blog content"""

    prompt = f"""
        You are an expert copywriter specializing in content optimization for SEO.
        I will provide you with a 'research report' and a 'blog content' on the same topic.
-        Your task is to transform and combine the given research and blog content into a well-structured, unique
+        Your task is to transform and combine the given research and blog content into a well-structured markdown, unique
        and engaging blog article.

        Your objectives include:
@@ -27,18 +37,40 @@ def blog_with_research(report, blog):
        11. Ensure Uniqueness: Guarantee the article is plagiarism-free. Write in unique, informative style.
        12. Punctuation: Use appropriate question marks at the end of questions.
        13. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
-        14. REMEMBER: Use the formatting style of given research report and include citations, referances in combined article.
+        14. REMEMBER: Use the formatting style of given research report and include highlights, citations, referances in combined article.

-                Follow these guidelines to combine and write a new, unique, and informative blog article
+        Follow these guidelines to combine and write a new, unique, and informative blog article
        that will rank well in search engine results and engage readers effectively.

-        Create a blog post from the given research report and blog content below.
+        Create a blog post, in markdown, from the given research report and blog content below.
        Research report: {report}
        Blog content: {blog}
        """
-    try:
-        # TBD: Add logic for which_provider and which_model
-        response = openai_chatgpt(prompt)
-        return response
-    except Exception as err:
-        SystemError(f"Error in combining blog and research report.")
+
+    if 'gemini' in gpt_providers:
+        prompt = f"""You are an expert copywriter specializing in content optimization for SEO. 
+        You are world famous writer, known for your originality and engaging content.  
+        I will provide you with a 'research report' and a 'blog content' on the same topic.
+        Your task is to transform and combine the given research and blog content into a blog article.
+        Your blog should be highly detailed and well formatted. 
+        Include a section in your blog on the highlights section of blog content. 
+        Do not miss out any details from provided content. Always, include figures, data, results from given content.
+        It is important that your blog is original and unique. It should be highly readable and SEO optimized.
+
+        Research report: '{report}'
+        Blog content: '{blog}'
+        """
+        try:
+            response = gemini_text_response(prompt)
+            return response
+        except Exception as err:
+            logger.error(f"Failed to get response from gemini: {err}")
+            raise err
+    elif 'openai' in gpt_providers:
+        try:
+            logger.info("Calling OpenAI LLM.")
+            response = openai_chatgpt(prompt)
+            return response
+        except Exception as err:
+            logger.error(f"failed to get response from Openai: {err}")
+            raise err
--- a/lib/convert_content_to_markdown.py
+++ b/lib/convert_content_to_markdown.py
@@ -4,12 +4,13 @@ from .gpt_providers.gemini_pro_text import gemini_text_response

 def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
    """ Helper for converting content to markdown format for static sites. """
-    prompt = f"""
-    As an expert in markdown language format and font matter,
+    
+    prompt = f"""As an expert in markdown language format and font matter,
    I will provide you with a blog post.
-    Your task is to improve formatting of given blog post.
+    Your task is to only Improve the formatting and structure of a blog post to enhance readability, visual appeal, and overall user experience. Do not alter the content of the provided blog. Modify only for the formatting.
+    Dont provide explanations, just your final response.

-    Use below guidelines to do formatting, structuring to make it highly readable:
+    Guidelines to do formatting:
    1. **Headings for Structure:**
   - Use # for the main title of the blog post.
   - Use ## for subheadings that divide the post into clear sections.
@@ -54,8 +55,6 @@ def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
   - Keep the blog post organized and easy to navigate.
   - Use a consistent formatting style throughout the post.
    
-    Dont provide explanations, just your final response.
-    Convert the given blog post in well organised markdown content:\n
    Blog Post: '{blog_content}'"""
    
    if 'openai' in gpt_provider:
@@ -65,6 +64,10 @@ def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
        except Exception as err:
            SystemError(f"Openai Error in converting to Markdown format.")
    elif 'gemini' in gpt_provider:
+
+        prompt = f""" Convert the given blog post into well structured MARKDOWN content. 
+        Do not alter the given blog post.
+        blog post: "{blog_content}" """
        try:
            response = gemini_text_response(prompt)
            return response
--- a/lib/get_blog_metadata.py
+++ b/lib/get_blog_metadata.py
@@ -0,0 +1,26 @@
+import sys
+
+from .get_blog_meta_desc import generate_blog_description
+from .get_tags import get_blog_tags
+from .get_blog_category import get_blog_categories
+from .get_blog_title import generate_blog_title
+
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+
+def blog_metadata(blog_content, gpt_providers="openai"):
+    """ Common function to get blog metadata: returns (title, meta description, tags, categories). """
+    blog_title = generate_blog_title(blog_content, gpt_providers)
+    blog_meta_desc = generate_blog_description(blog_content, gpt_providers)
+    logger.info(f"The blog meta description is: {blog_meta_desc}\n")
+    blog_tags = get_blog_tags(blog_content, gpt_providers)
+    logger.info(f"Blog tags for generated content: {blog_tags}")
+    blog_categories = get_blog_categories(blog_content, gpt_providers)
+    logger.info(f"Generated blog categories: {blog_categories}\n")
+
+    return(blog_title, blog_meta_desc, blog_tags, blog_categories)
--- a/lib/get_blog_title.py
+++ b/lib/get_blog_title.py
@@ -36,4 +36,4 @@ def generate_blog_title(blog_article, gpt_providers="openai"):
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
-            SystemError(f"Error in generating blog summary: {err}") 
+            SystemError(f"Failed to get response from Openai: {err}") 
--- a/lib/gpt_providers/gemini_arvix_image_details.py
+++ b/lib/gpt_providers/gemini_arvix_image_details.py
@@ -0,0 +1,79 @@
+"""Describe an image taken from an arXiv paper using the Gemini Pro Vision model.
+"""
+import os
+import logging
+from pathlib import Path
+
+import google.generativeai as genai
+logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
+from dotenv import load_dotenv
+load_dotenv(Path('../../.env'))
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+) # for exponential backoff
+
+
+@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+def gemini_arxiv_img_info(img_path):
+    """ Get image details from arxiv papers. """
+    try:
+        genai.configure(api_key=os.getenv("API_KEY"))
+    except Exception as e:
+        logging.error(f"Could not load gemini API key: {e}")
+        raise e
+
+    # Set up the model
+    generation_config = {
+        "temperature": 0.9,
+        "top_p": 1,
+        "top_k": 1,
+        "max_output_tokens": 1096,
+    }
+
+    safety_settings = [{
+        "category": "HARM_CATEGORY_HARASSMENT",
+        "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+    },
+    {
+        "category": "HARM_CATEGORY_HATE_SPEECH",
+        "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+    },
+    {
+        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+        "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+    },
+    {
+        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+        "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+    },]
+
+    try:
+        model = genai.GenerativeModel(model_name="gemini-pro-vision",
+            generation_config=generation_config,
+            safety_settings=safety_settings)
+    except Exception as e:
+        logging.error(f"Could not create GenerativeModel: {e}")
+        raise e
+
+    # Validate that an image is present
+    if not (img := Path(img_path)).exists():
+        raise FileNotFoundError(f"Could not find image: {img}")
+
+    image_parts = [{
+        "mime_type": "image/png",
+        "data": Path(img_path).read_bytes()
+    },]
+
+    prompt_parts = [
+        "As scholar on evaluating research papers, I will provide you with an image from a research paper. Your task is to explain the image in details so that I can use it in a blog article. Explain the key findings and conclusions from the image. Your description should be in simple terms to explain to a wider audience. Explain key findings from the given image.",
+        image_parts[0],]
+
+    try:
+        response = model.generate_content(prompt_parts)
+        return response.text
+    except Exception as e:
+        logging.error(f"Could not generate gemini content: {e}")
+        raise e
--- a/lib/gpt_providers/gemini_pro_text.py
+++ b/lib/gpt_providers/gemini_pro_text.py
@@ -7,6 +7,7 @@ import google.generativeai as genai
 logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
 from dotenv import load_dotenv
 load_dotenv(Path('../../.env'))
+from .mistral_chat_completion import mistral_text_response

 from tenacity import (
    retry,
@@ -17,7 +18,7 @@ from tenacity import (

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
 def gemini_text_response(prompt):
-    """ Provide a programming blog and get code exmaples."""
+    """ Common function to get a response from Gemini Pro text. """
    genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

    # Set up the model
@@ -25,14 +26,17 @@ def gemini_text_response(prompt):
        "temperature": 1,
        "top_p": 1,
        "top_k": 1,
-        "max_output_tokens": 4096,
+        "max_output_tokens": 6096,
    }

    model = genai.GenerativeModel(model_name="gemini-pro", generation_config=generation_config)
    try:
        response = model.generate_content(prompt)
-    except Exception as err:
-        logger.error(f"Failed to get response from Gemini: {err}. Retrying..")
-        gemini_research_report(query)

+    except Exception as err:
+        logger.error(f"Failed to get response from Gemini: {err}. Retrying.")
+        # Try with Mistral.
+        print(f"\n\n\n--MINSTRAL--\n\n\n\n")
+        response = mistral_text_response(prompt)
+        return response
    return response.text
--- a/lib/gpt_providers/mistral_chat_completion.py
+++ b/lib/gpt_providers/mistral_chat_completion.py
@@ -0,0 +1,40 @@
+import os
+import logging
+from pathlib import Path
+
+from mistralai.client import MistralClient
+from mistralai.models.chat_completion import ChatMessage
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
+from dotenv import load_dotenv
+load_dotenv(Path('../../.env'))
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)  # for exponential backoff
+
+
+@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+def mistral_text_response(prompt):
+    """ Common function to get a text response from Mistral. """
+    api_key = os.environ["MISTRAL_API_KEY"]
+    model = "mistral-medium"
+
+    client = MistralClient(api_key=api_key)
+
+    messages = [
+        ChatMessage(role="user", content=prompt)
+    ]
+
+    # No streaming
+    chat_response = client.chat(
+        model=model,
+        messages=messages,
+    )
+    print(chat_response)
+
+    # With streaming
+    for chunk in client.chat_stream(model=model, messages=messages):
+        print(chunk)
--- a/lib/main_arxiv_to_blog.py
+++ b/lib/main_arxiv_to_blog.py
@@ -0,0 +1,209 @@
+import sys
+import os
+import datetime
+
+import tiktoken
+
+from .arxiv_schlorly_research import fetch_arxiv_data, create_dataframe, get_arxiv_main_content
+from .arxiv_schlorly_research import arxiv_bibtex, scrape_images_from_arxiv, download_image
+from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
+from .write_research_review_blog import review_research_paper
+from .combine_research_and_blog import blog_with_research
+from .write_blog_scholar_paper import write_blog_from_paper
+from .gpt_providers.gemini_pro_text import gemini_text_response
+from .generate_image_from_prompt import generate_image
+from .convert_content_to_markdown import convert_tomarkdown_format
+from .get_blog_metadata import blog_metadata
+from .get_code_examples import gemini_get_code_samples
+from .save_blog_to_file import save_blog_to_file
+from .take_url_screenshot import screenshot_api
+
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+
+def blog_arxiv_keyword(query):
+    """ Write a blog on the first suitably sized arXiv paper returned for the given search query. """
+    arxiv_id = None
+    arxiv_url = None
+    bibtex = None
+    research_review = None
+    column_names = ['Title', 'Date', 'Id', 'Summary', 'PDF URL']
+    papers = fetch_arxiv_data(query)
+    df = create_dataframe(papers, column_names)
+
+    for paper in papers:
+        # Extracting the arxiv_id
+        arxiv_id = paper[2].split('/')[-1]
+        arxiv_url = "https://browse.arxiv.org/html/" + arxiv_id
+        bibtex = arxiv_bibtex(arxiv_id)
+        logger.info(f"Get research paper text from the url: {arxiv_url}")
+        research_content = get_arxiv_main_content(arxiv_url)
+        
+        num_tokens = num_tokens_from_string(research_content, "cl100k_base")
+        logger.info(f"Number of tokens sent: {num_tokens}")
+        # Only review papers whose token count is strictly between 1,000 and 30,000
+        if 1000 < num_tokens < 30000:
+            logger.info(f"Writing research review on {paper[0]}")
+            research_review = review_research_paper(research_content)
+            research_review = f"\n{research_review}\n\n" + f"```{bibtex}```"
+            #research_review = research_review + "\n\n\n" + f"{df.to_markdown()}"
+            research_review = convert_tomarkdown_format(research_review, "gemini")
+            break
+        else:
+            # Skip to the next iteration if the condition is not met
+            continue
+
+    logger.info(f"Final scholar article: \n\n{research_review}\n")
+    
+    # TBD: Scrape images from research reports and pass to vision to get conclusions out of it.
+    #image_urls = scrape_images_from_arxiv(arxiv_url)
+    #print("Downloading images found on the page:")
+    #for img_url in image_urls:
+    #    download_image(img_url, arxiv_url)
+    try:
+        blog_postprocessing(arxiv_id, research_review)
+    except Exception as err:
+        logger.error(f"Failed in blog post processing: {err}")
+        sys.exit(1)
+
+    logger.info(f"\n\n ################ Finished writing Blog for : #################### \n")
+
+
+def blog_arxiv_url_list(file_path):
+    """ Write blogs on all the arxiv links given in a file. """
+    extracted_ids = []
+    try:
+        with open(file_path, 'r') as file:
+            for line in file:
+                arxiv_id = extract_arxiv_ids_from_line(line)
+                if arxiv_id:
+                    extracted_ids.append(arxiv_id)
+    except FileNotFoundError:
+        logger.error(f"File not found: {file_path}")
+        raise FileNotFoundError
+    except Exception as e:
+        logger.error(f"Error while reading the file: {e}")
+        raise e
+
+    # Read already written IDs
+    written_ids = read_written_ids('papers_already_written_on.txt')
+
+    # Write blogs on each of arxiv_id from the file.
+    for arxiv_id in extracted_ids:
+        # Check if we have already written on this research_paper. For this, all arxiv ids are written in
+        # a file called 'papers_already_written_on.txt'. If arxiv ID is found in this file, skip writing again.        
+        # TODO: Use a database for this; a flat file keeps it simple for now.
+        written_ids = read_written_ids('papers_already_written_on.txt')
+
+    # Loop through extracted IDs
+    for arxiv_id in extracted_ids:
+        if arxiv_id not in written_ids:
+            # This ID has not been written on yet
+            arxiv_url = "https://browse.arxiv.org/html/" + arxiv_id
+            logger.info(f"Get research paper text from the url: {arxiv_url}")
+            research_content = get_arxiv_main_content(arxiv_url)
+            try:
+                num_tokens = num_tokens_from_string(research_content, "cl100k_base")
+            except Exception as err:
+                logger.error(f"Failed in counting tokens: {err}")
+                sys.exit(1)
+            logger.info(f"Number of tokens sent: {num_tokens}")
+            # Only review papers whose token count is strictly between 1,000 and 30,000
+            # FIXME: Docs over 30k tokens, need to be chunked and summarized.
+            if 1000 < num_tokens < 30000:
+                try:
+                    logger.info(f"Getting bibtex for arxiv ID: {arxiv_id}")
+                    bibtex = arxiv_bibtex(arxiv_id)
+                except Exception as err:
+                    logger.error(f"Failed to get Bibtex: {err}")
+
+                try:
+                    logger.info(f"Writing a research review..")
+                    research_review = review_research_paper(research_content, "gemini")
+                    logger.info(f"Research Review: \n{research_review}\n\n")
+                except Exception as err:
+                    logger.error(f"Failed to write review on research paper: {arxiv_id}{err}")
+
+                research_blog = write_blog_from_paper(research_content, "gemini")
+                logger.info(f"\n\nResearch Blog: {research_blog}\n\n")
+                research_blog = f"\n{research_review}\n\n" + f"```\n{bibtex}\n```"
+                #research_review = blog_with_research(research_review, research_blog, "gemini")
+                #logger.info(f"\n\n\nBLOG_WITH_RESEARCh: {research_review}\n\n\n")
+                research_review = convert_tomarkdown_format(research_review, "gemini")
+                research_review = f"\n{research_review}\n\n" + f"```{bibtex}```"
+                logger.info(f"Final blog from research paper: \n\n{research_review}\n\n\n")
+
+                try:
+                    blog_postprocessing(arxiv_id, research_review)
+                except Exception as err:
+                    logger.error(f"Failed in blog post processing: {err}")
+                    sys.exit(1)
+
+                logger.info(f"\n\n ################ Finished writing Blog for : #################### \n")
+            else:
+                # Skip to the next iteration if the condition is not met
+                logger.error("FIXME: Docs over 30k tokens, need to be chunked and summarized.")
+                continue
+        else:
+            logger.warning(f"Already written, skip writing on Arxiv paper ID: {arxiv_id}")
+
+
+def blog_postprocessing(arxiv_id, research_review):
+    """ Common function to do blog postprocessing. """
+    try:
+        append_id_to_file(arxiv_id, "papers_already_written_on.txt")
+    except Exception as err:
+        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
+        raise err
+
+    try:
+        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(research_review, "gemini")
+    except Exception as err:
+        logger.error(f"Failed to get blog metadata: {err}")
+        raise err
+
+    try:
+        arxiv_url_scrnsht = f"https://arxiv.org/abs/{arxiv_id}"
+        generated_image_filepath = take_paper_screenshot(arxiv_url_scrnsht)
+    except Exception as err:
+        logger.error(f"Failed to tsk paper screenshot: {err}")
+        raise err
+
+    try:
+        save_blog_to_file(research_review, blog_title, blog_meta_desc, blog_tags,\
+                blog_categories, generated_image_filepath)
+    except Exception as err:
+        logger.error(f"Failed to save blog to a file: {err}")
+        raise err
+
+
+def take_paper_screenshot(arxiv_url):
+    """ Common function to take paper screenshot. """
+    # FIXME: Remove the hard-coded image directory; take it as an option or read it from config.
+    image_dir = os.path.join(os.getcwd(), "blog_images")
+    generated_image_name = f"generated_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
+    generated_image_filepath = os.path.join(image_dir, generated_image_name)
+    
+    if arxiv_url:
+        try:
+            generated_image_filepath = screenshot_api(arxiv_url, generated_image_filepath)
+        except Exception as err:
+            logger.error(f"Failed in taking url screenshot: {err}")
+
+    return generated_image_filepath
+
+
+def num_tokens_from_string(string, encoding_name):
+    """Returns the number of tokens in a text string."""
+    try:
+        encoding = tiktoken.get_encoding(encoding_name)
+        num_tokens = len(encoding.encode(string))
+        return num_tokens
+    except Exception as err:
+        logger.error(f"Failed to count tokens: {err}")
+        sys.exit(1)
--- a/lib/save_blog_to_file.py
+++ b/lib/save_blog_to_file.py
@@ -2,6 +2,8 @@ import sys
 import os
 import re
 import datetime
+import random
+from dateutil.relativedelta import relativedelta
 from textwrap import dedent
 import logging
 from zoneinfo import ZoneInfo
@@ -22,6 +24,18 @@ output_path = "blogs"
 output_path = os.path.join(os.getcwd(), output_path)


+def random_date_last_three_months():
+    """Return a random timestamp from the last three months, as a string.
+
+    The window runs from three calendar months before the current
+    Asia/Kolkata time up to that current time. A whole-second offset into
+    the window is drawn with ``random.randint`` (both ends inclusive).
+
+    Returns:
+        The chosen moment formatted with ``'%Y-%m-%d %H:%M:%S %z'``.
+    """
+    now = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
+    window_start = now - relativedelta(months=3)
+    window_seconds = int((now - window_start).total_seconds())
+
+    # Pick a second-granular offset anywhere inside the window.
+    offset = datetime.timedelta(seconds=random.randint(0, window_seconds))
+    return (window_start + offset).strftime('%Y-%m-%d %H:%M:%S %z')
+
+
 def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_categories, main_img_path=None, file_type="md"):
    """
    Saves the provided blog content to a file in the specified format.
@@ -60,9 +74,11 @@ def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_
    # Handle Markdown file type
    if file_type == "md":
        logger.info("Writing/Saving the resultant blog content in Markdown format.")
-        dtobj = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
-        formatted_date = dtobj.strftime('%Y-%m-%d %H:%M:%S %z')
-        blog_title = blog_title.replace(":", "-").replace('"', '')
+        # Hmmmm, bulk generation will benefit from randomizing publishing dates.
+        #dtobj = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
+        #formatted_date = dtobj.strftime('%Y-%m-%d %H:%M:%S %z')
+        formatted_date = random_date_last_three_months()
+        blog_title = blog_title.replace(":", "-").replace('"', '').replace('**', '')
        if main_img_path:
            blog_frontmatter = dedent(f"""\
                ---
@@ -70,7 +86,7 @@ def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_
                date: {formatted_date}
                categories: [{blog_categories}]
                tags: [{blog_tags}]
-                description: {blog_meta_desc.replace(":", "-")}
+                description: {blog_meta_desc.replace(":", "-").replace('**', '')}
                img_path: '/assets/'
                image:
                    path: {os.path.basename(main_img_path)}
--- a/lib/take_url_screenshot.py
+++ b/lib/take_url_screenshot.py
@@ -44,27 +44,15 @@ def screenshot_api(url, generated_image_filepath):

    return generated_image_filepath

-
-def take_screenshot(url, generated_image_filepath, full_screenshot):
+def take_screenshot(url, generated_image_filepath):
    # Create a webdriver instance
    driver = webdriver.Chrome()

    # Navigate to the given url
    driver.get(url)

-    # Get the height of the webpage
-    page_height = driver.execute_script("return document.body.scrollHeight")
-
-    # Scroll down to the bottom of the webpage
-    for i in range(0, page_height, 100):
-        driver.execute_script(f"window.scrollTo(0, {i})")
-
-    # Get the total height of the webpage
-    total_height = driver.execute_script("return document.body.scrollHeight")
-
-    # Resize the webdriver window to the height of the webpage
-    if full_screenshot:
-        driver.set_window_size(800, total_height)
+    # Set a fixed window size (you can adjust this as needed)
+    driver.set_window_size(800, 600)

    # Take a screenshot of the webpage
    screenshot = driver.get_screenshot_as_png()
--- a/lib/write_blog_scholar_paper.py
+++ b/lib/write_blog_scholar_paper.py
@@ -0,0 +1,49 @@
+import sys
+
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+from .gpt_providers.gemini_pro_text import gemini_text_response
+
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+
+def write_blog_from_paper(paper_content, gpt_providers="openai"):
+    """Ask an LLM to write a detailed blog post from a research paper's text.
+
+    A fixed blog-structure prompt is built around ``paper_content`` and sent
+    to the provider selected by ``gpt_providers``.
+
+    Args:
+        paper_content: Text of the research paper; interpolated into the prompt.
+        gpt_providers: Provider selector, matched with ``in``. A value
+            containing ``'gemini'`` uses Gemini; otherwise a value containing
+            ``'openai'`` uses OpenAI.
+
+    Returns:
+        The provider's response, or ``None`` when ``gpt_providers`` matches
+        neither provider.
+
+    Raises:
+        Exception: Any error raised by the selected provider call is logged
+            and re-raised.
+    """
+    prompt = f"""As an expert in NLP and AI, I will provide you with a content of a research paper. 
+    Your task is to write a highly detailed blog(at least 2000 words), breaking down complex concepts for beginners.
+    Take your time and do not rush to respond.
+    Do not provide explanations, suggestions in your response.
+
+    Include the below section in your blog:
+    Highlights: Include a list of 5 most important and unique claims of the given research paper.
+    Abstract: Start by reading the abstract, which provides a concise summary of the research, including its purpose, methodology, and key findings.
+    Introduction: This section will give you background information and set the context for the research. It often ends with a statement of the research question or hypothesis.
+    Methodology: Include description of how authors conducted the research. This can include data sources, experimental setup, analytical techniques, etc.
+    Results: This section presents the data or findings of the research. Pay attention to figures, tables, and any statistical analysis provided.
+    Discussion/Analysis: In this section, Explain how research paper answers the research questions or how they fit with existing knowledge.
+    Conclusion: This part summarizes the main findings and their implications. It might also suggest areas for further research.
+    References: The cited works can provide additional context or background reading.
+    Remember, Please use MLA format and markdown syntax.
+    Do not provide description, explanations for your response.
+    Take your time in crafting your blog content, do not rush to give the response.
+    Using the blog structure above, please write a detailed and original blog on given research paper: \n'{paper_content}'\n\n"""
+
+    # Gemini is checked first, so a selector naming both providers uses Gemini.
+    if 'gemini' in gpt_providers:
+        try:
+            return gemini_text_response(prompt)
+        except Exception as err:
+            logger.error(f"Failed to get response from gemini: {err}")
+            raise err
+
+    if 'openai' in gpt_providers:
+        try:
+            logger.info("Calling OpenAI LLM.")
+            return openai_chatgpt(prompt)
+        except Exception as err:
+            logger.error(f"failed to get response from Openai: {err}")
+            raise err
+
+    # Unrecognised provider: nothing is called and None is returned.
+    return None
--- a/lib/write_research_review_blog.py
+++ b/lib/write_research_review_blog.py
@@ -0,0 +1,89 @@
+import sys
+
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+from .gpt_providers.gemini_pro_text import gemini_text_response
+from .gpt_providers.mistral_chat_completion import mistral_text_response
+
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+
+def review_research_paper(research_blog, gpt_providers="openai"):
+    """Ask an LLM to write a detailed review report of a research paper.
+
+    A fixed review-guidelines prompt is built around ``research_blog`` and
+    sent to the provider selected by ``gpt_providers``.
+
+    Args:
+        research_blog: Text of the research paper; interpolated into the prompt.
+        gpt_providers: Provider selector, matched with ``in``. A value
+            containing ``'gemini'`` uses Gemini and falls back to Mistral if
+            the Gemini call raises; otherwise a value containing ``'openai'``
+            uses OpenAI.
+
+    Returns:
+        The provider's response, or ``None`` when ``gpt_providers`` matches
+        neither provider.
+
+    Raises:
+        SystemError: If the OpenAI call fails. The underlying exception is
+            attached as ``__cause__``.
+        Exception: Whatever ``mistral_text_response`` raises when the Mistral
+            fallback also fails (that call is not guarded).
+    """
+    prompt = f"""As world's top researcher and academician, I will provide you with research paper.
+    Your task is to write a highly detailed review report. 
+    Important, your report should be factual, original and demostrate your expertise.
+
+    Review guidelines:
+    1). Read the Abstract and Introduction Carefully:
+        Begin by thoroughly reading the abstract and introduction of the paper.
+        Try to understand the research question, the objectives, and the background information.
+        Identify the central argument or hypothesis that the study is examining.
+
+    2). Examine the Methodology and Methods:
+        Read closely at the research design, whether it is experimental, observational, qualitative, or a combination of methods.
+        Check the sampling strategy and the size of the sample.
+        Review the methods of data collection and the instruments used for this purpose.
+        Think about any ethical issues and possible biases in the study.
+
+    3). Analyze the Results and Discussion:
+        Review how the results are presented, including any tables, graphs, and statistical analysis.
+        Evaluate the findings' validity and reliability.
+        Analyze whether the results support or contradict the research question and hypothesis.
+        Read the discussion section where the authors interpret their findings and their significance.
+
+    4). Consider the Limitations and Strengths:
+        Spot any limitations or potential weaknesses in the study.
+        Evaluate the strengths and contributions that the research makes.
+        Think about how generalizable the findings are to other populations or situations.
+
+    5). Assess the Writing and Organization:
+        Judge the clarity and structure of the report.
+        Consider the use of language, grammar, and the overall formatting.
+        Assess how well the arguments are logically organized and how coherent the report is.
+
+    6). Evaluate the Literature Review:
+        Examine how comprehensive and relevant the literature review is.
+        Consider how the study adds to or builds upon existing research.
+        Evaluate the timeliness and quality of the sources cited in the research.
+
+    7). Review the Conclusion and Implications:
+        Look at the conclusions drawn from the study and how well they align with the findings.
+        Think about the practical implications and potential applications of the research.
+        Evaluate the suggestions for further research or policy actions.
+
+    8). Overall Assessment:
+        Formulate an overall opinion about the research report's quality and thoroughness.
+        Consider the significance and impact of the findings.
+        Evaluate how the study contributes to its field of research.
+
+    9). Provide Constructive Feedback:
+        Offer constructive criticism and suggestions for improvement, where necessary.
+        Think about possible biases or alternative ways to interpret the findings.
+        Suggest ideas for future research or for replicating the study.
+
+    Do not provide description, explanations for your response.
+    Using the above review guidelines, write a detailed review report on the below research paper.
+    Research Paper: '{research_blog}'
+    """
+
+    if 'gemini' in gpt_providers:
+        try:
+            return gemini_text_response(prompt)
+        except Exception as err:
+            # Gemini failed: log it and retry the same prompt with Mistral.
+            logger.error(f"Failed to get response from gemini: {err}")
+            return mistral_text_response(prompt)
+
+    elif 'openai' in gpt_providers:
+        try:
+            logger.info("Calling OpenAI LLM.")
+            return openai_chatgpt(prompt)
+        except Exception as err:
+            # The exception used to be constructed but never raised, so the
+            # failure was swallowed and the function returned None. Log it
+            # and raise, keeping the original error as the cause.
+            message = f"Failed to get response from Openai: {err}"
+            logger.error(message)
+            raise SystemError(message) from err