Improved longform, Image, prompts

2024-07-01 19:21:28 +05:30
parent 97ece766c9
commit 94b938d31e
4 changed files with 95 additions and 45 deletions
--- a/lib/ai_writers/long_form_ai_writer.py
+++ b/lib/ai_writers/long_form_ai_writer.py
@@ -30,7 +30,7 @@ logger.add(sys.stdout,
 from ..utils.read_main_config_params import read_return_config_section
 from ..ai_web_researcher.gpt_online_researcher import do_metaphor_ai_research
 from ..ai_web_researcher.gpt_online_researcher import do_google_serp_search, do_tavily_ai_search
-from ..blog_metadata.get_blog_metadata import blog_metadata
+from ..blog_metadata.get_blog_metadata import get_blog_metadata_longform
 from ..blog_postprocessing.save_blog_to_file import save_blog_to_file


@@ -132,7 +132,7 @@ def long_form_generator(content_keywords):
        
        genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
        # Initialize the generative model
-        model = genai.GenerativeModel('gemini-1.5-flash', generation_config=generation_config)
+        model_flash = genai.GenerativeModel('gemini-1.5-flash', generation_config=generation_config)
        model_pro = genai.GenerativeModel('gemini-pro', generation_config=generation_config)
        
        # Do SERP web research for given keywords to generate title and outline.
@@ -148,7 +148,7 @@ def long_form_generator(content_keywords):
            return
        
        try:
-            content_outline = generate_with_retry(model_pro, content_outline.format(
+            content_outline = generate_with_retry(model_flash, content_outline.format(
                content_title=content_title, 
                web_research_result=web_research_result)).text
            logger.info(f"The content Outline is: {content_outline}\n\n")
@@ -187,9 +187,9 @@ def long_form_generator(content_keywords):
            logger.info(f"Starting to write on the outline introduction.")
            draft = starting_draft
            continuation = generate_with_retry(model_pro, continuation_prompt.format(
-                    content_title=content_title, 
-                    content_outline=content_outline, 
-                    content_text=draft, 
+                    content_title=content_title,
+                    content_outline=content_outline,
+                    content_text=draft,
                    web_research_result=web_research_result,
                    writing_guidelines=writing_guidelines)).text
        except Exception as err:
@@ -211,7 +211,7 @@ def long_form_generator(content_keywords):
            Content Outline:\n
            '{content_outline}'
            """
-        search_words = generate_with_retry(model, search_terms).text
+        search_words = generate_with_retry(model_flash, search_terms).text
        status.update(label=f"Search terms from written draft: {search_words}")
        
        while 'IAMDONE' not in continuation:
@@ -220,50 +220,48 @@ def long_form_generator(content_keywords):
            # Strip quotes from each element 
            str_list = [s.strip('\'"') for s in str_list]

-            for search_term in str_list:
-                web_research_result, m_titles, t_titles = do_tavily_ai_search(search_term, max_results=5)
-                try:
-                    continuation = generate_with_retry(model_pro, continuation_prompt.format(
-                        content_title=content_title,
-                        content_outline=content_outline, 
-                        content_text=draft, 
-                        web_research_result=web_research_result,
-                        writing_guidelines=writing_guidelines)).text
-    
-                    draft += '\n\n' + continuation
-                    logger.info(f"Writing in progress... Current draft length: {len(draft)} characters")
-                    status.update(label=f"Writing in progress... Current draft length: {len(draft)} characters")
-                    # At this point, the context is little stale. We should more web research on
-                    # related queries as per the content outline, to augment the LLM context.
-                except Exception as err:
-                    st.error(f"Failed to continually write the Essay: {err}")
-                    logger.error(f"Failed to continually write the Essay: {err}")
-                    return
+#            for search_term in str_list:
+#                web_research_result, m_titles, t_titles = do_tavily_ai_search(search_term, max_results=5)
+#                status.update(label=f"Search terms from written draft: {search_term}")
+#                for item in web_research_result.get("results"):
+#                    title = item.get("title", "")
+#                    snippet = item.get("content", "")
+#                    table_data.append([title, snippet])
+#                web_research_result = table_data
+
+            try:
+                continuation = generate_with_retry(model_pro, continuation_prompt.format(
+                            content_title=content_title,
+                            content_outline=content_outline, 
+                            content_text=draft, 
+                            web_research_result=web_research_result,
+                            writing_guidelines=writing_guidelines)).text
+        
+                draft += '\n\n' + continuation
+                logger.info(f"Writing in progress... Current draft length: {len(draft)} characters")
+                status.update(label=f"Writing in progress... Current draft length: {len(draft)} characters")
+                # At this point, the context is a little stale. We should do more web research on
+                # related queries as per the content outline, to augment the LLM context.
+            except Exception as err:
+                st.error(f"Failed to continually write long-form content: {err}")
+                logger.error(f"Failed to continually write the Essay: {err}")
+                return
        
        # Remove 'IAMDONE' and print the final story
        final = draft.replace('IAMDONE', '').strip()
        status.update(label="Success: Finished writing Long form content.")

-        # FIXME: The current implementation is suited for normal length content.
-        # In long content sending the whole content for each content metadata is expensive.
-#        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(final,
-#                content_keywords, m_titles)
+#        # In long content sending the whole content for each content metadata is expensive.
+#        # https://ai.google.dev/gemini-api/docs/caching?lang=python
+#        #blog_title, blog_meta_desc, blog_tags, blog_categories = get_blog_metadata_longform(final)
+#        blog_categories = get_blog_metadata_longform(final)
+#        print("\n\n-----{blog_categories}------\n\n")
+#
 #        status.update(label="Success: Finished with Title, Meta Description, Tags, categories")
 #        generated_image_filepath = None
 #        # TBD: Save the blog content as a .md file. Markdown or HTML ?
 #        save_blog_to_file(final, blog_title, blog_meta_desc, blog_tags, blog_categories, generated_image_filepath)
-#    
-#        blog_frontmatter = dedent(f"""
-#        \n---------------------------------------------------------------------
-#        title: {blog_title.strip()}\n
-#        categories: [{blog_categories.strip()}]\n
-#        tags: [{blog_tags.strip()}]\n
-#        Meta description: {blog_meta_desc.replace(":", "-").strip()}\n
-#        ---------------------------------------------------------------------\n
-#        """)
-#
-#        logger.info(f"\n{blog_frontmatter}{final}\n\n")
-#        st.markdown(f"\n{blog_frontmatter}{final}\n\n")
+        
        logger.info(f"\n{final}\n\n")

        logger.info(f"\n\n ################ Finished writing Blog for : {content_keywords} #################### \n")
--- a/lib/blog_metadata/get_blog_metadata.py
+++ b/lib/blog_metadata/get_blog_metadata.py
@@ -1,3 +1,6 @@
+import os
+import time
+import datetime
 import sys
 import streamlit as st
 from loguru import logger
@@ -9,9 +12,12 @@ logger.add(sys.stdout,
           colorize=True,
           format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
           )
+import google.generativeai as genai
+from google.generativeai import caching

 from ..gpt_providers.text_generation.main_text_generation import llm_text_gen

+
 async def blog_metadata(blog_article):
    """ Common function to get blog metadata """
    logger.info(f"Generating Content MetaData\n")
@@ -47,6 +53,7 @@ async def blog_metadata(blog_article):

    return blog_title, blog_meta_desc, blog_tags, blog_categories

+
 def generate_blog_title(blog_article):
    """
    Given a blog title generate an outline for it
@@ -124,3 +131,48 @@ def run_async(coro):
    result = loop.run_until_complete(coro)
    loop.close()
    return result
+
+
+def get_blog_metadata_longform(longform_content):
+    """Summarize long-form content through a Gemini context cache.
+
+    Writes the text to a file under CONTENT_SAVE_DIR, uploads that file with
+    genai.upload_file, caches it for 15 minutes and queries the cached model.
+
+    Args:
+        longform_content: The long-form text to upload and summarize.
+
+    Returns:
+        The text of the model's response.
+    """
+    filepath = os.path.join(os.getenv("CONTENT_SAVE_DIR"), "lognform_metadata_file")
+    # Mode "w" overwrites any previous run; UTF-8 keeps non-ASCII text writable
+    # regardless of the platform's default encoding.
+    with open(filepath, "w", encoding="utf-8") as file:
+        file.write(longform_content)
+    print(f"String saved successfully to: {filepath}")
+
+    genai.configure(api_key=os.environ['GEMINI_API_KEY'])
+    uploaded_file = genai.upload_file(path=filepath)
+
+    # Poll until the upload leaves the PROCESSING state, refreshing the handle
+    # by its own name on every iteration.
+    while uploaded_file.state.name == 'PROCESSING':
+        print('Waiting for file to be processed.')
+        time.sleep(2)
+        uploaded_file = genai.get_file(uploaded_file.name)
+    print(f'File processing complete: {uploaded_file.uri}')
+
+    # Create a cache with a 15 minute TTL.
+    cache = caching.CachedContent.create(
+        model='models/gemini-1.5-flash-001',
+        display_name='Alwrity Longform content', # used to identify the cache
+        system_instruction=(
+            'You are an expert file analyzer , and your job is to answer '
+            'the user\'s query based on the file you have access to.'
+        ),
+        contents=[uploaded_file],
+        ttl=datetime.timedelta(minutes=15),
+    )
+    model = genai.GenerativeModel.from_cached_content(cached_content=cache)
+    return model.generate_content(['SUmmarize the given file in 10 lines list main points']).text
--- a/lib/workspace/alwrity_config/main_config.json
+++ b/lib/workspace/alwrity_config/main_config.json
@@ -2,7 +2,7 @@
    "Blog Content Characteristics": {
        "Blog Length": "2000",
        "Blog Tone": "Casual",
-        "Blog Demographic": "Professional",
+        "Blog Demographic": "Digital Marketing",
        "Blog Type": "Informational",
        "Blog Language": "English",
        "Blog Output Format": "markdown"
--- a/lib/workspace/alwrity_prompts/long_form_ai_writer.prompts
+++ b/lib/workspace/alwrity_prompts/long_form_ai_writer.prompts
@@ -62,7 +62,7 @@ starting_prompt: |
  First, silently review the content outline and title. Consider how to begin writing your content. Take your time.
  Start by writing the very beginning of the outline. You are not expected to finish the entire content now.
  Your writing should be detailed, only scratching the surface of the first bullet of your outline.
-  Try to write AT MINIMUM 1000 WORDS.
+  Try to write AT MINIMUM 2000 WORDS.

  {writing_guidelines}

@@ -92,7 +92,7 @@ continuation_prompt: |

  Your task is to continue writing from where you left off and cover the next part of the outline. 
  You are not expected to finish the entire content now. 
-  Aim to write at least 1000 words. 
+  Aim to write at least 800 words. 
  However, only once the entire content is completely finished, write IAMDONE.

  {writing_guidelines}