Long doc/pdf, chunking for insights
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
# How to Alwrity - Getting Started
|
# How to ALwrity - Getting Started
|
||||||
|
|
||||||
Alwrity assists content creators and digital marketers with keyword web research, AI website & social media content generation, and AI copywriting.
|
Alwrity assists content creators and digital marketers with keyword web research, AI website & social media content generation, and AI copywriting.
|
||||||
Our toolkit integrates AI models **(OpenAI, Gemini, Anthropic)** for text generation, image creation **(Stability.ai)**, speech-to-text **(Whisper, AssemblyAI)**, and web or local data analysis, streamlining your content creation pipeline and ensuring high-quality output with minimal effort.
|
Our toolkit integrates AI models **(OpenAI, Gemini, Anthropic)** for text generation, image creation **(Stability.ai)**, speech-to-text **(Whisper, AssemblyAI)**, and web or local data analysis, streamlining your content creation pipeline and ensuring high-quality output with minimal effort.
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def openai_chatgpt(prompt, model, temperature, max_tokens, top_p, n, fp):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prompt (str): The input text to generate completion for.
|
prompt (str): The input text to generate completion for.
|
||||||
model (str, optional): Model to be used for the completion. Defaults to "gpt-4-1106-preview".
|
model (str, optional): Model to be used for the completion. Defaults to "gpt-4o".
|
||||||
temperature (float, optional): Controls randomness. Lower values make responses more deterministic. Defaults to 0.2.
|
temperature (float, optional): Controls randomness. Lower values make responses more deterministic. Defaults to 0.2.
|
||||||
max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 4096
|
max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 4096
|
||||||
top_p (float, optional): Controls diversity. Defaults to 0.9.
|
top_p (float, optional): Controls diversity. Defaults to 0.9.
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ import configparser
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import uuid
|
import uuid
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from PyPDF2 import PdfReader
|
import PyPDF2
|
||||||
|
import openai
|
||||||
|
import tiktoken
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
logger.remove()
|
logger.remove()
|
||||||
@@ -41,6 +43,7 @@ from lib.ai_seo_tools.content_title_generator import ai_title_generator
|
|||||||
from lib.ai_seo_tools.meta_desc_generator import metadesc_generator_main
|
from lib.ai_seo_tools.meta_desc_generator import metadesc_generator_main
|
||||||
from lib.gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
|
from lib.gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
|
||||||
from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_planner
|
from lib.content_planning_calender.content_planning_agents_alwrity_crew import ai_agents_planner
|
||||||
|
from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
|
||||||
|
|
||||||
|
|
||||||
def record_voice(language="en"):
|
def record_voice(language="en"):
|
||||||
@@ -92,13 +95,15 @@ def process_input(input_text, uploaded_file):
|
|||||||
return "keywords"
|
return "keywords"
|
||||||
|
|
||||||
if uploaded_file is not None:
|
if uploaded_file is not None:
|
||||||
file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type, "filesize": uploaded_file.size}
|
file_details = {"filename": uploaded_file.name, "filetype": uploaded_file.type}
|
||||||
st.write(file_details)
|
st.write(file_details)
|
||||||
if uploaded_file.type.startswith("text/"):
|
if uploaded_file.type.startswith("text/"):
|
||||||
content = uploaded_file.read().decode("utf-8")
|
content = uploaded_file.read().decode("utf-8")
|
||||||
st.text(content)
|
st.text(content)
|
||||||
|
|
||||||
elif uploaded_file.type == "application/pdf":
|
elif uploaded_file.type == "application/pdf":
|
||||||
st.write("PDF file uploaded. Add your PDF processing logic here.")
|
return "PDF_file"
|
||||||
|
|
||||||
elif uploaded_file.type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword"]:
|
elif uploaded_file.type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword"]:
|
||||||
st.write("Word document uploaded. Add your DOCX processing logic here.")
|
st.write("Word document uploaded. Add your DOCX processing logic here.")
|
||||||
elif uploaded_file.type.startswith("image/"):
|
elif uploaded_file.type.startswith("image/"):
|
||||||
@@ -174,8 +179,8 @@ def blog_from_keyword():
|
|||||||
if not uploaded_file and not user_input and not audio_input:
|
if not uploaded_file and not user_input and not audio_input:
|
||||||
st.error("🤬🤬 Either Enter/Type/Attach, can't read your mind.(yet..)")
|
st.error("🤬🤬 Either Enter/Type/Attach, can't read your mind.(yet..)")
|
||||||
st.stop()
|
st.stop()
|
||||||
|
else:
|
||||||
input_type = process_input(user_input, uploaded_file)
|
input_type = process_input(user_input, uploaded_file)
|
||||||
|
|
||||||
if input_type == "keywords":
|
if input_type == "keywords":
|
||||||
if user_input and len(user_input.split()) >= 2:
|
if user_input and len(user_input.split()) >= 2:
|
||||||
@@ -209,6 +214,94 @@ def blog_from_keyword():
|
|||||||
elif input_type == "image_file":
|
elif input_type == "image_file":
|
||||||
blog_from_image(user_input, temp_file_path)
|
blog_from_image(user_input, temp_file_path)
|
||||||
|
|
||||||
|
elif input_type == "PDF_file":
|
||||||
|
pdf_reader = PyPDF2.PdfReader(uploaded_file)
|
||||||
|
text = ""
|
||||||
|
combined_result = ""
|
||||||
|
# Create a placeholder for the progress bar
|
||||||
|
progress_bar = st.progress(0)
|
||||||
|
|
||||||
|
# Loop through each page with a progress bar
|
||||||
|
for page_num, page in enumerate(pdf_reader.pages):
|
||||||
|
text += page.extract_text()
|
||||||
|
# Replace newlines with spaces
|
||||||
|
text = text.replace("\n", " ")
|
||||||
|
# Use regex to add a space between words that are combined
|
||||||
|
text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)
|
||||||
|
|
||||||
|
results = blog_from_pdf(text)
|
||||||
|
# Update the progress bar
|
||||||
|
progress_bar.progress((page_num + 1) / len(pdf_reader.pages))
|
||||||
|
combined_result += str(results[-1])
|
||||||
|
|
||||||
|
# Clear progress bar at the end
|
||||||
|
progress_bar.empty()
|
||||||
|
|
||||||
|
st.markdown(combined_result)
|
||||||
|
|
||||||
|
|
||||||
|
def blog_from_pdf(pdf_text):
    """Extract key pieces of information from long PDF text, chunk by chunk.

    The text is tokenized with tiktoken's ``cl100k_base`` encoding, split by
    ``create_chunks`` into chunks of roughly 1000 tokens, and every chunk is
    sent through ``extract_chunk`` together with the extraction prompt.

    Args:
        pdf_text (str): Text that has already been pulled out of the PDF.

    Returns:
        list: One ``extract_chunk`` result per chunk, in document order.
            Empty when ``pdf_text`` produces no tokens.
    """
    # FIXME: the prompt asks the model for page numbers.
    # NOTE(review): nothing in this function adds page markers to the
    # chunks - confirm whether the caller embeds them in pdf_text.
    # '<document>' is the placeholder that extract_chunk substitutes.
    template_prompt = '''Extract key pieces of information from the given document.

When you extract a key piece of information, include the closest page number.
Ex: Extracted Information (Page number)
\n\nDocument: \"\"\"<document>\"\"\"\n\n'''

    # Initialise tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")

    # create_chunks yields token lists; decode them back to text for the LLM.
    text_chunks = [
        tokenizer.decode(token_chunk)
        for token_chunk in create_chunks(pdf_text, 1000, tokenizer)
    ]

    return [extract_chunk(text_chunk, template_prompt) for text_chunk in text_chunks]
|
||||||
|
|
||||||
|
|
||||||
|
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
|
||||||
|
def create_chunks(text, n, tokenizer):
    """Yield successive token chunks of roughly ``n`` tokens from ``text``.

    Each chunk preferably ends at the end of a sentence: starting from
    ``1.5 * n`` tokens, the candidate end is moved backwards one token at a
    time (down to ``0.5 * n`` tokens) until the decoded chunk ends with a
    full stop or a newline. If no such boundary is found, the chunk is cut
    at exactly ``n`` tokens (or at the end of the input).

    Args:
        text (str): Text to split.
        n (int): Target chunk size in tokens; must be at least 1.
        tokenizer: Object providing ``encode(text)`` returning a sliceable
            token sequence and ``decode(tokens)`` returning a string.

    Yields:
        Slices of the encoded token sequence, in order, which together
        cover every token exactly once.

    Raises:
        ValueError: If ``n`` is smaller than 1 (such a size would never
            advance through the tokens). Because this is a generator, the
            error is raised when iteration starts.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    tokens = tokenizer.encode(text)
    total = len(tokens)
    # Search window for a sentence boundary, relative to the chunk start.
    min_offset = int(0.5 * n)
    max_offset = int(1.5 * n)

    i = 0
    while i < total:
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        lower_bound = i + min_offset
        j = min(i + max_offset, total)
        while j > lower_bound:
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith((".", "\n")):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == lower_bound:
            j = min(i + n, total)
        yield tokens[i:j]
        i = j
|
||||||
|
|
||||||
|
|
||||||
|
def extract_chunk(document, template_prompt):
    """Run the extraction prompt against one chunk of a large document.

    Large documents exceed the model's context window, so callers pass one
    chunk at a time. The chunk text replaces the ``<document>`` placeholder
    in ``template_prompt`` and the resulting prompt is sent through
    ``llm_text_gen``.

    Args:
        document (str): Text of a single chunk.
        template_prompt (str): Prompt containing the ``<document>``
            placeholder.

    Returns:
        Whatever ``llm_text_gen`` returns for the filled-in prompt.

    Raises:
        SystemExit: With exit status 1 when ``llm_text_gen`` raises; the
            error is logged first and chained as the cause.
    """
    # Generation goes through llm_text_gen, so no provider-specific client
    # is constructed here.
    prompt = template_prompt.replace('<document>', document)

    try:
        return llm_text_gen(prompt)
    except Exception as err:
        logger.error(f"Exit: Failed to get response from LLM: {err}")
        # Same exception type as exit(1), without relying on the
        # site-provided builtin.
        raise SystemExit(1) from err
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def ai_agents_team():
|
def ai_agents_team():
|
||||||
# Define options for AI Content Teams
|
# Define options for AI Content Teams
|
||||||
|
|||||||
Reference in New Issue
Block a user