From bd79fa5974388b49c9b1b8fe529364613b17309f Mon Sep 17 00:00:00 2001
From: ajaysi
Date: Thu, 1 Aug 2024 22:56:13 +0530
Subject: [PATCH] Long doc/pdf, chunking for insights

---
 README.md                                     |   2 +-
 .../text_generation/openai_text_gen.py        |   2 +-
 lib/utils/alwrity_utils.py                    | 144 +++++++++++++++++-
 3 files changed, 141 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index b0d0b2d4..3f8c34f8 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# How to Alwrity - Getting Started
+# How to ALwrity - Getting Started
 
 Alwrity assists content creators and digital marketers in keyword web research, AI website & Social media content generation & AI Copywriting. Our toolkit integrates **(OpenAI, Gemini, Anthropic)** AI models for text generation, image creation **(Stability.ai), STT(whisper, AssemblyAI)** and Web or local data analysis, streamlining your content creation pipeline and ensuring high-quality output with minimal effort.
 
diff --git a/lib/gpt_providers/text_generation/openai_text_gen.py b/lib/gpt_providers/text_generation/openai_text_gen.py
index fe5b4f58..9b2788fe 100644
--- a/lib/gpt_providers/text_generation/openai_text_gen.py
+++ b/lib/gpt_providers/text_generation/openai_text_gen.py
@@ -20,7 +20,7 @@ def openai_chatgpt(prompt, model, temperature, max_tokens, top_p, n, fp):
 
     Args:
         prompt (str): The input text to generate completion for.
-        model (str, optional): Model to be used for the completion. Defaults to "gpt-4-1106-preview".
+        model (str, optional): Model to be used for the completion. Defaults to "gpt-4o".
         temperature (float, optional): Controls randomness. Lower values make responses more deterministic. Defaults to 0.2.
         max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 4096
         top_p (float, optional): Controls diversity. Defaults to 0.9.
diff --git a/lib/utils/alwrity_utils.py b/lib/utils/alwrity_utils.py
index fcb03af1..001bb909 100644
--- a/lib/utils/alwrity_utils.py
+++ b/lib/utils/alwrity_utils.py
@@ -10,7 +10,9 @@ import configparser
 from datetime import datetime
 import uuid
 from PIL import Image
-from PyPDF2 import PdfReader
+import PyPDF2
+import openai
+import tiktoken
 from docx import Document
 from loguru import logger
 logger.remove()
@@ -41,6 +43,7 @@ from lib.ai_seo_tools.content_title_generator import ai_title_generator
 from lib.ai_seo_tools.meta_desc_generator import metadesc_generator_main
 from lib.gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
 from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_planner
+from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
 
 
 def record_voice(language="en"):
@@ -92,13 +95,15 @@ def process_input(input_text, uploaded_file):
         return "keywords"
 
     if uploaded_file is not None:
-        file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type, "filesize": uploaded_file.size}
+        file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type}
         st.write(file_details)
         if uploaded_file.type.startswith("text/"):
             content = uploaded_file.read().decode("utf-8")
             st.text(content)
+
         elif uploaded_file.type == "application/pdf":
-            st.write("PDF file uploaded. Add your PDF processing logic here.")
+            return "PDF_file"
+
         elif uploaded_file.type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword"]:
             st.write("Word document uploaded. Add your DOCX processing logic here.")
         elif uploaded_file.type.startswith("image/"):
@@ -174,8 +179,8 @@ def blog_from_keyword():
     if not uploaded_file and not user_input and not audio_input:
         st.error("🤬🤬 Either Enter/Type/Attach, can't read your mind.(yet..)")
         st.stop()
-
-    input_type = process_input(user_input, uploaded_file)
+    else:
+        input_type = process_input(user_input, uploaded_file)
 
     if input_type == "keywords":
         if user_input and len(user_input.split()) >= 2:
@@ -209,6 +214,135 @@ def blog_from_keyword():
     elif input_type == "image_file":
         blog_from_image(user_input, temp_file_path)
 
+    elif input_type == "PDF_file":
+        pdf_reader = PyPDF2.PdfReader(uploaded_file)
+        text = ""
+        combined_result = ""
+        # Create a placeholder for the progress bar
+        progress_bar = st.progress(0)
+
+        # Loop through each page with a progress bar
+        for page_num, page in enumerate(pdf_reader.pages):
+            text += page.extract_text()
+            # Replace newlines with spaces
+            text = text.replace("\n", " ")
+            # Use regex to add a space between words that are combined
+            text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)
+
+            results = blog_from_pdf(text)
+            # Update the progress bar
+            progress_bar.progress((page_num + 1) / len(pdf_reader.pages))
+            combined_result += str(results[-1])
+
+        # Clear progress bar at the end
+        progress_bar.empty()
+
+        st.markdown(combined_result)
+
+
+# Marker inside prompt templates that extract_chunk replaces with chunk text.
+_DOCUMENT_PLACEHOLDER = "{document}"
+
+
+def blog_from_pdf(pdf_text):
+    """Extract key information from long PDF text, one chunk at a time.
+
+    The text is encoded with the ``cl100k_base`` tokenizer, split by
+    ``create_chunks`` into pieces of roughly 1000 tokens, and every piece
+    is sent to the LLM through ``extract_chunk``.
+
+    Args:
+        pdf_text (str): Text already extracted from the PDF.
+
+    Returns:
+        list: One LLM response per chunk, in document order. The list is
+            empty when ``pdf_text`` encodes to no tokens.
+    """
+    template_prompt = (
+        "Extract key pieces of information from the given document.\n\n"
+        "When you extract a key piece of information, include the closest "
+        "page number.\n"
+        "Ex: Extracted Information (Page number)\n"
+        f'\n\nDocument: """{_DOCUMENT_PLACEHOLDER}"""\n\n'
+    )
+
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    return [
+        extract_chunk(tokenizer.decode(chunk), template_prompt)
+        for chunk in create_chunks(pdf_text, 1000, tokenizer)
+    ]
+
+
+def create_chunks(text, n, tokenizer):
+    """Yield successive chunks of roughly ``n`` tokens from ``text``.
+
+    A chunk preferably ends at the end of a sentence: the cut point is
+    searched backwards from 1.5 * n tokens down to 0.5 * n tokens for a
+    position where the decoded chunk ends with "." or a newline. When no
+    such position exists, the chunk is cut after ``n`` tokens instead.
+
+    Args:
+        text (str): Text to split.
+        n (int): Target chunk size in tokens; must be at least 1.
+        tokenizer: Object with ``encode(text)`` and ``decode(tokens)``.
+
+    Yields:
+        Consecutive slices of the token sequence from ``tokenizer.encode``.
+
+    Raises:
+        ValueError: If ``n`` is smaller than 1, because the loop could
+            then never advance.
+    """
+    if n < 1:
+        raise ValueError(f"n must be at least 1, got {n}")
+
+    tokens = tokenizer.encode(text)
+    shortest = int(0.5 * n)
+    longest = int(1.5 * n)
+    i = 0
+    while i < len(tokens):
+        # Start from the longest allowed chunk and shrink it until it ends
+        # a sentence or reaches the lower bound.
+        j = min(i + longest, len(tokens))
+        while j > i + shortest:
+            if tokenizer.decode(tokens[i:j]).endswith((".", "\n")):
+                break
+            j -= 1
+        # Stopping on the lower bound means no sentence end was found.
+        if j == i + shortest:
+            j = min(i + n, len(tokens))
+        yield tokens[i:j]
+        i = j
+
+
+def extract_chunk(document, template_prompt):
+    """Run the extraction prompt on one chunk of a large document.
+
+    Args:
+        document (str): Text of the chunk to analyse.
+        template_prompt (str): Prompt containing the ``{document}`` marker.
+            If the marker is missing, the chunk is appended to the prompt.
+
+    Returns:
+        Whatever ``llm_text_gen`` returns for the filled-in prompt.
+
+    Raises:
+        SystemExit: If the LLM call fails; the error is logged first.
+    """
+    if _DOCUMENT_PLACEHOLDER in template_prompt:
+        prompt = template_prompt.replace(_DOCUMENT_PLACEHOLDER, document)
+    else:
+        prompt = f"{template_prompt}\n{document}"
+
+    try:
+        return llm_text_gen(prompt)
+    except Exception as err:
+        logger.error(f"Exit: Failed to get response from LLM: {err}")
+        # Raise SystemExit directly: the ``exit`` helper is injected by the
+        # ``site`` module and is not guaranteed to be available.
+        raise SystemExit(1) from err
+
+
 def ai_agents_team():
     # Define options for AI Content Teams
 