Base code

This commit is contained in:
Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

View File

@@ -0,0 +1,202 @@
import sys
import os
import datetime
import tiktoken
from .arxiv_schlorly_research import fetch_arxiv_data, create_dataframe, get_arxiv_main_content
from .arxiv_schlorly_research import arxiv_bibtex, scrape_images_from_arxiv, download_image
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
from .write_research_review_blog import review_research_paper
from .combine_research_and_blog import blog_with_research
from .write_blog_scholar_paper import write_blog_from_paper
from .gpt_providers.gemini_pro_text import gemini_text_response
from .generate_image_from_prompt import generate_image
from .convert_content_to_markdown import convert_tomarkdown_format
from .get_blog_metadata import blog_metadata
from .get_code_examples import gemini_get_code_samples
from .save_blog_to_file import save_blog_to_file
from .take_url_screenshot import screenshot_api
from loguru import logger
# Configure loguru: drop the default handler and log to stdout with a
# compact "LEVEL|file:line:function| message" layout.
logger.remove()
logger.add(sys.stdout,
           colorize=True,
           format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
           )
def blog_arxiv_keyword(query):
    """Write a blog post on the first suitable arxiv paper matching *query*.

    Fetches papers for the query, picks the first whose main content fits
    the model context window (1k-30k tokens), writes a research review,
    converts it to markdown and runs common post-processing (metadata,
    screenshot, save to file).

    Args:
        query: Search string passed to the arxiv lookup.
    """
    arxiv_id = None
    research_review = None
    column_names = ['Title', 'Date', 'Id', 'Summary', 'PDF URL']
    papers = fetch_arxiv_data(query)
    # NOTE(review): df only feeds the commented-out markdown table below;
    # the call is kept so any side effects of create_dataframe are unchanged.
    df = create_dataframe(papers, column_names)
    for paper in papers:
        # paper[2] is the arxiv URL; its last path segment is the arxiv ID.
        arxiv_id = paper[2].split('/')[-1]
        arxiv_url = "https://browse.arxiv.org/html/" + arxiv_id
        bibtex = arxiv_bibtex(arxiv_id)
        logger.info(f"Get research paper text from the url: {arxiv_url}")
        research_content = get_arxiv_main_content(arxiv_url)
        num_tokens = num_tokens_from_string(research_content, "cl100k_base")
        logger.info(f"Number of tokens sent: {num_tokens}")
        # Only papers that fit the context window are processed.
        # FIXME: docs over 30k tokens need to be chunked and summarized.
        if 1000 < num_tokens < 30000:
            logger.info(f"Writing research review on {paper[0]}")
            research_review = review_research_paper(research_content)
            research_review = f"\n{research_review}\n\n" + f"```{bibtex}```"
            #research_review = research_review + "\n\n\n" + f"{df.to_markdown()}"
            research_review = convert_tomarkdown_format(research_review, "gemini")
            break
    # BUGFIX: if no paper fell inside the token window, research_review is
    # still None; the original passed None on to blog_postprocessing.
    if research_review is None:
        logger.error(f"No suitable paper found for query: {query}")
        return
    logger.info(f"Final scholar article: \n\n{research_review}\n")
    # TBD: Scrape images from research reports and pass to vision to get conclusions out of it.
    #image_urls = scrape_images_from_arxiv(arxiv_url)
    #print("Downloading images found on the page:")
    #for img_url in image_urls:
    #    download_image(img_url, arxiv_url)
    try:
        blog_postprocessing(arxiv_id, research_review)
    except Exception as err:
        logger.error(f"Failed in blog post processing: {err}")
        sys.exit(1)
    logger.info(f"\n\n ################ Finished writing Blog for : #################### \n")
def blog_arxiv_url_list(file_path):
    """Write blogs on all the arxiv links given in *file_path*.

    Each line of the file is scanned for an arxiv ID; IDs already recorded
    in 'papers_already_written_on.txt' are skipped. For each remaining ID
    the paper is fetched, reviewed and post-processed.

    Args:
        file_path: Path to a text file containing arxiv links/IDs.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        Exception: Re-raised errors while reading the file.
    """
    extracted_ids = []
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            for line in file:
                arxiv_id = extract_arxiv_ids_from_line(line)
                if arxiv_id:
                    extracted_ids.append(arxiv_id)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise FileNotFoundError
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
        raise e
    # Read already written IDs so the same paper is never written twice.
    written_ids = read_written_ids('papers_already_written_on.txt')
    for arxiv_id in extracted_ids:
        if arxiv_id in written_ids:
            logger.warning(f"Already written, skip writing on Arxiv paper ID: {arxiv_id}")
            continue
        arxiv_url = "https://browse.arxiv.org/html/" + arxiv_id
        logger.info(f"Get research paper text from the url: {arxiv_url}")
        research_content = get_arxiv_main_content(arxiv_url)
        try:
            num_tokens = num_tokens_from_string(research_content, "cl100k_base")
        except Exception as err:
            logger.error(f"Failed in counting tokens: {err}")
            sys.exit(1)
        logger.info(f"Number of tokens sent: {num_tokens}")
        # FIXME: Docs over 30k tokens, need to be chunked and summarized.
        if not 1000 < num_tokens < 30000:
            logger.error("FIXME: Docs over 30k tokens, need to be chunked and summarized.")
            continue
        try:
            logger.info(f"Getting bibtex for arxiv ID: {arxiv_id}")
            bibtex = arxiv_bibtex(arxiv_id)
        except Exception as err:
            # BUGFIX: the original left `bibtex` unbound on failure and
            # crashed with NameError further down; degrade to an empty one.
            logger.error(f"Failed to get Bibtex: {err}")
            bibtex = ""
        try:
            logger.info(f"Writing a research review..")
            research_review = review_research_paper(research_content, "gemini")
            logger.info(f"Research Review: \n{research_review}\n\n")
        except Exception as err:
            # BUGFIX: the original fell through with `research_review`
            # unbound; skip this paper instead of crashing below.
            logger.error(f"Failed to write review on research paper: {arxiv_id}{err}")
            continue
        research_blog = write_blog_from_paper(research_content, "gemini")
        logger.info(f"\n\nResearch Blog: {research_blog}\n\n")
        # NOTE(review): the original immediately clobbered research_blog
        # with a dead reassignment; the blog text is currently unused
        # pending the blog_with_research combination below.
        #research_review = blog_with_research(research_review, research_blog, "gemini")
        #logger.info(f"\n\n\nBLOG_WITH_RESEARCh: {research_review}\n\n\n")
        research_review = convert_tomarkdown_format(research_review, "gemini")
        research_review = f"\n{research_review}\n\n" + f"```{bibtex}```"
        logger.info(f"Final blog from research paper: \n\n{research_review}\n\n\n")
        try:
            blog_postprocessing(arxiv_id, research_review)
        except Exception as err:
            logger.error(f"Failed in blog post processing: {err}")
            sys.exit(1)
        logger.info(f"\n\n ################ Finished writing Blog for : #################### \n")
def blog_postprocessing(arxiv_id, research_review):
    """Common function to do blog postprocessing.

    Records the arxiv ID as written, generates blog metadata, takes a
    screenshot of the paper's abstract page and saves everything to file.

    Args:
        arxiv_id: Arxiv identifier of the paper the blog is about.
        research_review: Final markdown blog content.

    Raises:
        Exception: Failures from any step are logged and re-raised so the
            caller decides how to handle them.
    """
    try:
        # Remember the ID so blog_arxiv_url_list() skips it on later runs.
        append_id_to_file(arxiv_id, "papers_already_written_on.txt")
    except Exception as err:
        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
        raise err
    try:
        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(research_review)
    except Exception as err:
        logger.error(f"Failed to get blog metadata: {err}")
        raise err
    try:
        # Screenshot the human-readable abstract page, not the HTML render.
        arxiv_url_scrnsht = f"https://arxiv.org/abs/{arxiv_id}"
        generated_image_filepath = take_paper_screenshot(arxiv_url_scrnsht)
    except Exception as err:
        # typo fix: "tsk" -> "take"
        logger.error(f"Failed to take paper screenshot: {err}")
        raise err
    try:
        save_blog_to_file(research_review, blog_title, blog_meta_desc, blog_tags,
                          blog_categories, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to save blog to a file: {err}")
        # BUGFIX: raise like every other step instead of sys.exit(1);
        # both callers already catch this and exit themselves.
        raise err
def take_paper_screenshot(arxiv_url):
    """Common function to take a paper screenshot.

    Args:
        arxiv_url: URL of the paper page. A falsy value skips the
            screenshot and just returns the (not yet created) target path.

    Returns:
        Filesystem path of the screenshot image.
    """
    # fixme: Remove the hardcoding, need add another option OR in config ?
    image_dir = os.path.join(os.getcwd(), "blog_images")
    # BUGFIX: ensure the output directory exists before the API writes to it.
    os.makedirs(image_dir, exist_ok=True)
    generated_image_name = f"generated_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
    generated_image_filepath = os.path.join(image_dir, generated_image_name)
    if arxiv_url:
        try:
            generated_image_filepath = screenshot_api(arxiv_url, generated_image_filepath)
        except Exception as err:
            logger.error(f"Failed in taking url screenshot: {err}")
    return generated_image_filepath
def num_tokens_from_string(string, encoding_name):
    """Return the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name of a tiktoken encoding, e.g. "cl100k_base".

    Returns:
        Token count as an int.

    Raises:
        Exception: Tokenization errors are logged and re-raised. (BUGFIX:
            the original called sys.exit(1) here, which made the caller's
            try/except around this helper in blog_arxiv_url_list dead code.)
    """
    try:
        encoding = tiktoken.get_encoding(encoding_name)
        return len(encoding.encode(string))
    except Exception as err:
        logger.error(f"Failed to count tokens: {err}")
        raise

View File

@@ -0,0 +1,49 @@
import sys
from .gpt_providers.openai_chat_completion import openai_chatgpt
from .gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
# Configure loguru: drop the default handler and log to stdout with a
# compact "LEVEL|file:line:function| message" layout.
logger.remove()
logger.add(sys.stdout,
           colorize=True,
           format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
           )
def write_blog_from_paper(paper_content, gpt_providers="gemini"):
    """Write a blog from the given paper content.

    Args:
        paper_content: Main text of the research paper.
        gpt_providers: Provider selector; a string containing 'gemini' or
            'openai'. (BUGFIX: the original read an undefined global
            `gpt_providers` and raised NameError; callers already pass
            "gemini" as a second positional argument, so the new parameter
            is backward-compatible.)

    Returns:
        The generated blog text.

    Raises:
        Exception: Provider errors are logged and re-raised.
        ValueError: If *gpt_providers* matches no known provider.
    """
    prompt = f"""As an expert in NLP and AI, I will provide you with a content of a research paper.
    Your task is to write a highly detailed blog(at least 2000 words), breaking down complex concepts for beginners.
    Take your time and do not rush to respond.
    Do not provide explanations, suggestions in your response.
    Include the below section in your blog:
    Highlights: Include a list of 5 most important and unique claims of the given research paper.
    Abstract: Start by reading the abstract, which provides a concise summary of the research, including its purpose, methodology, and key findings.
    Introduction: This section will give you background information and set the context for the research. It often ends with a statement of the research question or hypothesis.
    Methodology: Include description of how authors conducted the research. This can include data sources, experimental setup, analytical techniques, etc.
    Results: This section presents the data or findings of the research. Pay attention to figures, tables, and any statistical analysis provided.
    Discussion/Analysis: In this section, Explain how research paper answers the research questions or how they fit with existing knowledge.
    Conclusion: This part summarizes the main findings and their implications. It might also suggest areas for further research.
    References: The cited works can provide additional context or background reading.
    Remember, Please use MLA format and markdown syntax.
    Do not provide description, explanations for your response.
    Take your time in crafting your blog content, do not rush to give the response.
    Using the blog structure above, please write a detailed and original blog on given research paper: \n'{paper_content}'\n\n"""
    if 'gemini' in gpt_providers:
        try:
            response = gemini_text_response(prompt)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise err
    # BUGFIX: the original silently returned None for unknown providers.
    raise ValueError(f"Unknown gpt provider: {gpt_providers}")

View File

@@ -0,0 +1,89 @@
import sys
from .gpt_providers.openai_chat_completion import openai_chatgpt
from .gpt_providers.gemini_pro_text import gemini_text_response
from .gpt_providers.mistral_chat_completion import mistral_text_response
from loguru import logger
# Configure loguru: drop the default handler and log to stdout with a
# compact "LEVEL|file:line:function| message" layout.
logger.remove()
logger.add(sys.stdout,
           colorize=True,
           format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
           )
def review_research_paper(research_blog, gpt_providers="gemini"):
    """Write a detailed review report on the given research paper.

    Args:
        research_blog: Full text of the research paper to review.
        gpt_providers: Provider selector; a string containing 'gemini' or
            'openai'. (BUGFIX: the original read an undefined global
            `gpt_providers` and raised NameError; one caller passes
            "gemini" positionally, another passes nothing, so a default of
            "gemini" keeps both working.)

    Returns:
        The generated review text. On gemini failure, falls back to mistral.

    Raises:
        Exception: OpenAI errors are logged and re-raised.
    """
    prompt = f"""As world's top researcher and academician, I will provide you with research paper.
    Your task is to write a highly detailed review report.
    Important, your report should be factual, original and demostrate your expertise.
    Review guidelines:
    1). Read the Abstract and Introduction Carefully:
    Begin by thoroughly reading the abstract and introduction of the paper.
    Try to understand the research question, the objectives, and the background information.
    Identify the central argument or hypothesis that the study is examining.
    2). Examine the Methodology and Methods:
    Read closely at the research design, whether it is experimental, observational, qualitative, or a combination of methods.
    Check the sampling strategy and the size of the sample.
    Review the methods of data collection and the instruments used for this purpose.
    Think about any ethical issues and possible biases in the study.
    3). Analyze the Results and Discussion:
    Review how the results are presented, including any tables, graphs, and statistical analysis.
    Evaluate the findings' validity and reliability.
    Analyze whether the results support or contradict the research question and hypothesis.
    Read the discussion section where the authors interpret their findings and their significance.
    4). Consider the Limitations and Strengths:
    Spot any limitations or potential weaknesses in the study.
    Evaluate the strengths and contributions that the research makes.
    Think about how generalizable the findings are to other populations or situations.
    5). Assess the Writing and Organization:
    Judge the clarity and structure of the report.
    Consider the use of language, grammar, and the overall formatting.
    Assess how well the arguments are logically organized and how coherent the report is.
    6). Evaluate the Literature Review:
    Examine how comprehensive and relevant the literature review is.
    Consider how the study adds to or builds upon existing research.
    Evaluate the timeliness and quality of the sources cited in the research.
    7). Review the Conclusion and Implications:
    Look at the conclusions drawn from the study and how well they align with the findings.
    Think about the practical implications and potential applications of the research.
    Evaluate the suggestions for further research or policy actions.
    8). Overall Assessment:
    Formulate an overall opinion about the research report's quality and thoroughness.
    Consider the significance and impact of the findings.
    Evaluate how the study contributes to its field of research.
    9). Provide Constructive Feedback:
    Offer constructive criticism and suggestions for improvement, where necessary.
    Think about possible biases or alternative ways to interpret the findings.
    Suggest ideas for future research or for replicating the study.
    Do not provide description, explanations for your response.
    Using the above review guidelines, write a detailed review report on the below research paper.
    Research Paper: '{research_blog}'
    """
    if 'gemini' in gpt_providers:
        try:
            response = gemini_text_response(prompt)
            return response
        except Exception as err:
            # On gemini failure, fall back to the mistral provider.
            logger.error(f"Failed to get response from gemini: {err}")
            response = mistral_text_response(prompt)
            return response
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            # BUGFIX: the original constructed SystemError(...) without
            # raising it, silently swallowing the failure and returning
            # None; log and re-raise the real error instead.
            logger.error(f"Failed to get response from Openai: {err}")
            raise err