Long doc/pdf, chunking for insights

2024-08-01 22:56:13 +05:30
parent f204219edb
commit bd79fa5974
3 changed files with 100 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# How to Alwrity - Getting Started
+# How to ALwrity - Getting Started
 Alwrity assists content creators and digital marketers in keyword web research, AI website & Social media content generation & AI Copywriting.
 Our toolkit integrates **(OpenAI, Gemini, Anthropic)** AI models for text generation, image creation **(Stability.ai), STT(whisper, AssemblyAI)** and Web or local data analysis, streamlining your content creation pipeline and ensuring high-quality output with minimal effort.
--- a/lib/gpt_providers/text_generation/openai_text_gen.py
+++ b/lib/gpt_providers/text_generation/openai_text_gen.py
@@ -20,7 +20,7 @@ def openai_chatgpt(prompt, model, temperature, max_tokens, top_p, n, fp):
    Args:
        prompt (str): The input text to generate completion for.
-        model (str, optional): Model to be used for the completion. Defaults to "gpt-4-1106-preview".
+        model (str, optional): Model to be used for the completion. Defaults to "gpt-4o".
        temperature (float, optional): Controls randomness. Lower values make responses more deterministic. Defaults to 0.2.
        max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 4096
        top_p (float, optional): Controls diversity. Defaults to 0.9.
--- a/lib/utils/alwrity_utils.py
+++ b/lib/utils/alwrity_utils.py
@@ -10,7 +10,9 @@ import configparser
 from datetime import datetime
 import uuid
 from PIL import Image
-from PyPDF2 import PdfReader
+import PyPDF2
 import openai
 import tiktoken
 from docx import Document
 from loguru import logger
 logger.remove()
@@ -41,6 +43,7 @@ from lib.ai_seo_tools.content_title_generator import ai_title_generator
 from lib.ai_seo_tools.meta_desc_generator import metadesc_generator_main
 from lib.gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
 from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_planner
 from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
 def record_voice(language="en"):
@@ -92,13 +95,15 @@ def process_input(input_text, uploaded_file):
        return "keywords"
    if uploaded_file is not None:
-        file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type, "filesize": uploaded_file.size}
+        file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type}
        st.write(file_details)
        if uploaded_file.type.startswith("text/"):
            content = uploaded_file.read().decode("utf-8")
            st.text(content)
        elif uploaded_file.type == "application/pdf":
-            st.write("PDF file uploaded. Add your PDF processing logic here.")
+            return "PDF_file"
        elif uploaded_file.type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword"]:
            st.write("Word document uploaded. Add your DOCX processing logic here.")
        elif uploaded_file.type.startswith("image/"):
@@ -174,8 +179,8 @@ def blog_from_keyword():
        if not uploaded_file and not user_input and not audio_input:
            st.error("🤬🤬 Either Enter/Type/Attach, can't read your mind.(yet..)")
            st.stop()
-
+        else:
-        input_type = process_input(user_input, uploaded_file)
+            input_type = process_input(user_input, uploaded_file)
        if input_type == "keywords":
            if user_input and len(user_input.split()) >= 2:
@@ -209,6 +214,94 @@ def blog_from_keyword():
        elif input_type == "image_file":
            blog_from_image(user_input, temp_file_path)
        elif input_type == "PDF_file":
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            text = ""
            combined_result = ""
            # Create a placeholder for the progress bar
            progress_bar = st.progress(0)
            # Loop through each page with a progress bar
            for page_num, page in enumerate(pdf_reader.pages):
                text += page.extract_text()
                # Replace newlines with spaces
                text = text.replace("\n", " ")
                # Use regex to add a space between words that are combined
                text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)
                results = blog_from_pdf(text)
                # Update the progress bar
                progress_bar.progress((page_num + 1) / len(pdf_reader.pages))
                combined_result += str(results[-1])
            # Clear progress bar at the end
            progress_bar.empty()
            st.markdown(combined_result)
 def blog_from_pdf(pdf_text, chunk_size=1000):
    """Extract key pieces of information from long PDF text, one chunk at a time.

    The text is tokenized with tiktoken's ``cl100k_base`` encoding, split by
    ``create_chunks`` into chunks of roughly ``chunk_size`` tokens, and every
    decoded chunk is sent to ``extract_chunk`` together with the extraction
    prompt. This function does not read the PDF itself; it expects the text to
    have been extracted by the caller.

    Args:
        pdf_text (str): Text previously extracted from the PDF.
        chunk_size (int, optional): Target number of tokens per chunk.
            Defaults to 1000.

    Returns:
        list: One ``extract_chunk`` result per chunk, in document order.
    """
    # NOTE(review): the prompt asks the model for page numbers, but nothing in
    # this function adds page markers to the chunks - confirm whether pdf_text
    # is expected to carry them.
    # '<document>' is a literal placeholder that extract_chunk replaces with the
    # chunk text, so this must stay a plain (non-f) string.
    template_prompt = '''Extract key pieces of information from the given document.
        When you extract a key piece of information, include the closest page number.
        Ex: Extracted Information (Page number)
        \n\nDocument: \"\"\"<document>\"\"\"\n\n'''
    # Initialise tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # create_chunks yields token slices; decode each back to text before prompting.
    return [
        extract_chunk(tokenizer.decode(chunk), template_prompt)
        for chunk in create_chunks(pdf_text, chunk_size, tokenizer)
    ]
 # Split a text into smaller chunks of size n, preferably ending at the end of a sentence
 def create_chunks(text, n, tokenizer):
    """Yield successive chunks of roughly ``n`` tokens from ``text``.

    The text is encoded once with ``tokenizer``. Each chunk starts where the
    previous one ended and preferably stops at a sentence end: the candidate
    end is moved backwards from ``1.5 * n`` tokens towards ``0.5 * n`` tokens
    until the decoded chunk ends with "." or a newline. If no such boundary is
    found in that window, the chunk is cut at ``n`` tokens.

    Args:
        text (str): Text to split.
        n (int): Target chunk size in tokens; must be at least 1.
        tokenizer: Object providing ``encode(text)`` and ``decode(tokens)``.

    Yields:
        Slices of the encoded token sequence, in order and without overlap.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    if n < 1:
        # With n <= 0 the chunk end never moves past the start, so the loop
        # below would yield empty chunks forever.
        raise ValueError(f"n must be a positive number of tokens, got {n!r}")
    tokens = tokenizer.encode(text)
    total = len(tokens)
    max_span = int(1.5 * n)
    min_span = int(0.5 * n)
    i = 0
    while i < total:
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + max_span, total)
        while j > i + min_span:
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith((".", "\n")):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == i + min_span:
            j = min(i + n, total)
        yield tokens[i:j]
        i = j
 def extract_chunk(document, template_prompt):
    """Run the extraction prompt on a single document chunk.

    Large documents exceed the model's context window, so callers split them
    into chunks and call this function once per chunk.

    Args:
        document (str): Text of one chunk.
        template_prompt (str): Prompt containing the literal placeholder
            ``<document>``, which is replaced with ``document``.

    Returns:
        The value returned by ``llm_text_gen`` for the filled-in prompt.

    Raises:
        SystemExit: With exit code 1 if ``llm_text_gen`` raises; the original
            error is logged and chained as the cause.
    """
    # No OpenAI client is created here: the request is sent through
    # llm_text_gen, and building an unused client could itself fail when
    # OPENAI_API_KEY is not set.
    prompt = template_prompt.replace('<document>', document)
    try:
        return llm_text_gen(prompt)
    except Exception as err:
        logger.error(f"Exit: Failed to get response from LLM: {err}")
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper, which is meant for interactive use.
        raise SystemExit(1) from err
 def ai_agents_team():
    # Define options for AI Content Teams