Long doc/pdf, chunking for insights

This commit is contained in:
ajaysi
2024-08-01 22:56:13 +05:30
parent f204219edb
commit bd79fa5974
3 changed files with 100 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
# How to Alwrity - Getting Started
# How to ALwrity - Getting Started
ALwrity assists content creators and digital marketers with keyword web research, AI website and social media content generation, and AI copywriting.
Our toolkit integrates **(OpenAI, Gemini, Anthropic)** AI models for text generation, image creation **(Stability.ai), STT(whisper, AssemblyAI)** and Web or local data analysis, streamlining your content creation pipeline and ensuring high-quality output with minimal effort.

View File

@@ -20,7 +20,7 @@ def openai_chatgpt(prompt, model, temperature, max_tokens, top_p, n, fp):
Args:
prompt (str): The input text to generate completion for.
model (str, optional): Model to be used for the completion. Defaults to "gpt-4-1106-preview".
model (str, optional): Model to be used for the completion. Defaults to "gpt-4o".
temperature (float, optional): Controls randomness. Lower values make responses more deterministic. Defaults to 0.2.
max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 4096
top_p (float, optional): Controls diversity. Defaults to 0.9.

View File

@@ -10,7 +10,9 @@ import configparser
from datetime import datetime
import uuid
from PIL import Image
from PyPDF2 import PdfReader
import PyPDF2
import openai
import tiktoken
from docx import Document
from loguru import logger
logger.remove()
@@ -41,6 +43,7 @@ from lib.ai_seo_tools.content_title_generator import ai_title_generator
from lib.ai_seo_tools.meta_desc_generator import metadesc_generator_main
from lib.gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_planner
from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
def record_voice(language="en"):
@@ -92,13 +95,15 @@ def process_input(input_text, uploaded_file):
return "keywords"
if uploaded_file is not None:
file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type, "filesize": uploaded_file.size}
file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type}
st.write(file_details)
if uploaded_file.type.startswith("text/"):
content = uploaded_file.read().decode("utf-8")
st.text(content)
elif uploaded_file.type == "application/pdf":
st.write("PDF file uploaded. Add your PDF processing logic here.")
return "PDF_file"
elif uploaded_file.type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword"]:
st.write("Word document uploaded. Add your DOCX processing logic here.")
elif uploaded_file.type.startswith("image/"):
@@ -174,8 +179,8 @@ def blog_from_keyword():
if not uploaded_file and not user_input and not audio_input:
st.error("🤬🤬 Either Enter/Type/Attach, can't read your mind.(yet..)")
st.stop()
input_type = process_input(user_input, uploaded_file)
else:
input_type = process_input(user_input, uploaded_file)
if input_type == "keywords":
if user_input and len(user_input.split()) >= 2:
@@ -209,6 +214,94 @@ def blog_from_keyword():
elif input_type == "image_file":
blog_from_image(user_input, temp_file_path)
elif input_type == "PDF_file":
pdf_reader = PyPDF2.PdfReader(uploaded_file)
text = ""
combined_result = ""
# Create a placeholder for the progress bar
progress_bar = st.progress(0)
# Loop through each page with a progress bar
for page_num, page in enumerate(pdf_reader.pages):
text += page.extract_text()
# Replace newlines with spaces
text = text.replace("\n", " ")
# Use regex to add a space between words that are combined
text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)
results = blog_from_pdf(text)
# Update the progress bar
progress_bar.progress((page_num + 1) / len(pdf_reader.pages))
combined_result += str(results[-1])
# Clear progress bar at the end
progress_bar.empty()
st.markdown(combined_result)
def blog_from_pdf(pdf_text):
    """Extract key pieces of information from long PDF text, chunk by chunk.

    The text is tokenized with the ``cl100k_base`` tiktoken encoding and split
    by ``create_chunks`` using a target size of 1000 tokens. Each token chunk
    is decoded back to text and passed to ``extract_chunk`` together with the
    extraction prompt.

    Args:
        pdf_text (str): Text already extracted from the PDF.

    Returns:
        list: One ``extract_chunk`` result per chunk, in document order.
    """
    # NOTE(review): the prompt asks the model for page numbers, but nothing in
    # this function adds page markers to the text -- confirm the caller
    # supplies them, otherwise the reported page numbers cannot be trusted.
    template_prompt = (
        "Extract key pieces of information from the given document.\n"
        "When you extract a key piece of information, include the closest page number.\n"
        "Ex: Extracted Information (Page number)\n"
        '\n\nDocument: """<document>"""\n\n'
    )

    # Chunking works on tokens so each request stays within a predictable size.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    return [
        extract_chunk(tokenizer.decode(chunk), template_prompt)
        for chunk in create_chunks(pdf_text, 1000, tokenizer)
    ]
def create_chunks(text, n, tokenizer):
    """Yield successive token chunks of roughly ``n`` tokens from ``text``.

    The text is encoded once with ``tokenizer``. Each chunk is preferably cut
    where its decoded text ends with a full stop or a newline, searching
    backwards from ``1.5 * n`` tokens down to just above ``0.5 * n`` tokens.
    When no such boundary is found, the chunk is cut at exactly ``n`` tokens.

    Args:
        text: The string to split.
        n: Target chunk size in tokens; must be positive.
        tokenizer: Object exposing ``encode(str)``, returning a sliceable
            sequence of tokens, and ``decode(tokens)``, returning a ``str``.

    Yields:
        Consecutive, non-overlapping slices of the encoded token sequence that
        together cover all of it.

    Raises:
        ValueError: If ``n`` is not positive. Because this is a generator, the
            error surfaces when iteration starts, not at call time.
    """
    if n <= 0:
        # A non-positive size never advances the cursor and would loop forever.
        raise ValueError(f"chunk size n must be positive, got {n!r}")

    tokens = tokenizer.encode(text)
    total = len(tokens)
    lower_offset = int(0.5 * n)
    upper_offset = int(1.5 * n)

    i = 0
    while i < total:
        # Find the nearest end of sentence within 0.5 * n and 1.5 * n tokens,
        # starting from the longest candidate and shrinking one token at a time.
        j = min(i + upper_offset, total)
        while j > i + lower_offset:
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith((".", "\n")):
                break
            j -= 1
        # If no end of sentence was found, use n tokens as the chunk size.
        if j == i + lower_offset:
            j = min(i + n, total)
        yield tokens[i:j]
        i = j
def extract_chunk(document, template_prompt):
    """Run the extraction prompt on a single chunk of a document.

    Args:
        document (str): Text of the chunk to analyse.
        template_prompt (str): Prompt containing the ``<document>``
            placeholder, which is replaced with ``document``.

    Returns:
        Whatever ``llm_text_gen`` returns for the filled-in prompt.

    Raises:
        SystemExit: With exit code 1 when ``llm_text_gen`` raises; the
            underlying error is logged first and attached as the cause.
    """
    # No OpenAI client is built here: the request goes through llm_text_gen,
    # and constructing an unused client fails when OPENAI_API_KEY is unset.
    prompt = template_prompt.replace('<document>', document)
    try:
        return llm_text_gen(prompt)
    except Exception as err:
        logger.error(f"Exit: Failed to get response from LLM: {err}")
        # Same exception type and code as the former exit(1), but without
        # relying on the site-provided builtin, which also closes stdin.
        raise SystemExit(1) from err
def ai_agents_team():
# Define options for AI Content Teams