Youtube AI Writer Tools

2025-04-08 18:37:35 +05:30
parent 940a4ad1fa
commit 8312dbaaac
18 changed files with 3979 additions and 274 deletions
--- a/lib/gpt_providers/text_to_image_generation/gen_gemini_images.py
+++ b/lib/gpt_providers/text_to_image_generation/gen_gemini_images.py
@@ -0,0 +1,377 @@
+import os
+from PIL import Image
+from io import BytesIO
+import PIL
+import streamlit as st
+from google import genai
+from google.genai import types
+import logging
+import datetime
+import base64
+import random
+import time
+
+
+from .save_image import save_generated_image
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger('gemini_image_generator')
+
+# With image generation in Gemini, your imagination is the limit. 
+# If what you see doesn't quite match what you had in mind, try adding more details to the prompt. 
+# The more specific you are, the better Gemini can create images that reflect your vision.
+
+# Generate images using Gemini
+# Gemini 2.0 Flash Experimental supports the ability to output text and inline images. 
+# This lets you use Gemini to conversationally edit images or generate outputs with interwoven text (for example, generating a blog post with text and images in a single turn).
+# Note: Make sure to include responseModalities: ["Text", "Image"] in your generation configuration for text and image output with gemini-2.0-flash-exp-image-generation. Image only is not allowed.
+
+
+class AIPromptGenerator:
+    """
+    Generates enhanced AI image prompts based on user keywords,
+    following the guidelines of the Imagen documentation.
+
+    A prompt starts from the keywords and is decorated with randomly chosen
+    modifiers taken from the vocabularies defined in ``__init__``. The result
+    is random; seed the ``random`` module for reproducible output.
+    """
+
+    def __init__(self):
+        # Vocabularies the generator samples from. They are treated as
+        # read-only by the generate_* methods.
+        self.photography_styles = ["photo", "photograph"]
+        self.art_styles = ["painting", "sketch", "drawing", "illustration", "digital art", "render"]
+        self.art_techniques = ["technical pencil drawing", "charcoal drawing", "color pencil drawing", "pastel painting", "digital art", "art deco (poster)", "impressionist painting", "renaissance painting", "pop art"]
+        self.camera_proximity = ["close-up", "zoomed out", "taken from far away"]
+        self.camera_position = ["aerial", "from below"]
+        self.lighting = ["natural lighting", "dramatic lighting", "warm lighting", "cold lighting", "studio lighting", "golden hour lighting"]
+        self.camera_settings = ["motion blur", "soft focus", "bokeh", "portrait"]
+        self.lens_types = ["35mm lens", "50mm lens", "fisheye lens", "wide angle lens", "macro lens", "telephoto lens"]
+        self.film_types = ["black and white film", "polaroid"]
+        self.materials = ["made of cheese", "made of paper", "made of neon tubes", "metallic", "glass", "wooden", "stone"]
+        self.shapes = ["in the shape of a bird", "angular", "curved", "geometric"]
+        self.quality_modifiers_general = ["high-quality", "beautiful", "stylized", "detailed", "epic", "grand"]
+        self.quality_modifiers_photo = ["4K", "HDR", "studio photo", "professional photo", "photorealistic"]
+        self.quality_modifiers_art = ["by a professional artist", "intricate details", "masterpiece"]
+        self.aspect_ratios = ["1:1 aspect ratio", "4:3 aspect ratio", "3:4 aspect ratio", "16:9 aspect ratio", "9:16 aspect ratio"]
+        # Modifier pools for generate_photorealistic_prompt, keyed by focus.
+        self.photorealistic_modifiers = {
+            "portraits": ["prime lens", "zoom lens", "24-35mm", "black and white film", "film noir", "shallow depth of field", "duotone (mention two colors)"],
+            "objects": ["macro lens", "60-105mm", "high detail", "precise focusing", "controlled lighting"],
+            "motion": ["telephoto zoom lens", "100-400mm", "fast shutter speed", "action shot", "movement tracking"],
+            "wide-angle": ["wide-angle lens", "10-24mm", "long exposure", "sharp focus", "smooth water or clouds", "astro photography"]
+        }
+
+    def generate_prompt(self, keywords):
+        """
+        Generates an enhanced AI image prompt based on user-provided keywords.
+
+        Args:
+            keywords (list): A list of keywords describing the desired image.
+                A single string is accepted and treated as one keyword.
+
+        Returns:
+            str: An enhanced AI image prompt ("A beautiful image." when
+            ``keywords`` is empty). Parts are joined with ", ".
+        """
+        if not keywords:
+            return "A beautiful image."
+
+        # A bare string would otherwise be joined character by character.
+        if isinstance(keywords, str):
+            keywords = [keywords]
+
+        prompt_parts = [" ".join(keywords)]
+
+        # Add context and background (optional)
+        context_options = ["in a detailed background", "outdoors", "indoors", "in a studio", "with a blurred background"]
+        if random.random() < 0.6:  # Add context with a probability
+            prompt_parts.append(random.choice(context_options))
+
+        # Add style (optional); the style always goes first in the prompt
+        style_options = self.photography_styles + [f"{art} of" for art in self.art_styles]
+        if random.random() < 0.7:
+            prompt_parts.insert(0, random.choice(style_options))
+            if prompt_parts[0].startswith(("painting of", "sketch of", "drawing of")):
+                if random.random() < 0.5:
+                    prompt_parts.append(f"in the style of {random.choice(self.art_techniques)}")
+
+        # The first part is either the chosen style or, when no style was
+        # inserted, the subject itself; it no longer changes past this point.
+        is_photo = any(style in prompt_parts[0] for style in self.photography_styles)
+
+        # Add photography modifiers (if photography style is chosen)
+        if is_photo:
+            if random.random() < 0.4:
+                prompt_parts.append(random.choice(self.camera_proximity))
+            if random.random() < 0.3:
+                prompt_parts.append(random.choice(self.camera_position))
+            if random.random() < 0.5:
+                prompt_parts.append(random.choice(self.lighting))
+            if random.random() < 0.3:
+                prompt_parts.append(random.choice(self.camera_settings))
+            if random.random() < 0.2:
+                prompt_parts.append(random.choice(self.lens_types))
+            if random.random() < 0.1:
+                prompt_parts.append(random.choice(self.film_types))
+
+        # Add shapes and materials (optional)
+        if random.random() < 0.3:
+            prompt_parts.append(random.choice(self.materials))
+        if random.random() < 0.2:
+            prompt_parts.append(random.choice(self.shapes))
+
+        # Add quality modifiers (optional)
+        if random.random() < 0.6:
+            extra_modifiers = self.quality_modifiers_photo if is_photo else self.quality_modifiers_art
+            # Build a NEW list: "+=" on the attribute would extend
+            # self.quality_modifiers_general in place on every call.
+            quality_options = self.quality_modifiers_general + extra_modifiers
+            # dict.fromkeys removes duplicates while keeping order, so a seeded
+            # RNG gives reproducible picks (set order varies between runs).
+            prompt_parts.append(random.choice(list(dict.fromkeys(quality_options))))
+
+        # Add aspect ratio (optional)
+        if random.random() < 0.2:
+            prompt_parts.append(random.choice(self.aspect_ratios))
+
+        return ", ".join(prompt_parts)
+
+    def generate_photorealistic_prompt(self, keywords, focus=""):
+        """
+        Generates an enhanced AI image prompt specifically for photorealistic images.
+
+        Args:
+            keywords (list): A list of keywords describing the desired image.
+                A single string is accepted and treated as one keyword.
+            focus (str, optional): The focus of the photorealistic image (e.g., "portraits", "objects", "motion", "wide-angle"). Defaults to "".
+                Values that are not keys of ``photorealistic_modifiers`` add no focus modifiers.
+
+        Returns:
+            str: An enhanced photorealistic AI image prompt
+            ("A photorealistic image." when ``keywords`` is empty).
+        """
+        if not keywords:
+            return "A photorealistic image."
+
+        # A bare string would otherwise be joined character by character.
+        if isinstance(keywords, str):
+            keywords = [keywords]
+
+        prompt_parts = ["A photo of", "photorealistic", " ".join(keywords)]
+
+        # Add between one and three distinct modifiers for the requested focus
+        modifiers = self.photorealistic_modifiers.get(focus) if focus else None
+        if modifiers:
+            num_modifiers = random.randint(1, min(3, len(modifiers)))
+            prompt_parts.extend(random.sample(modifiers, num_modifiers))
+
+        # Add general quality modifiers
+        if random.random() < 0.5:
+            prompt_parts.append(random.choice(self.quality_modifiers_photo))
+
+        # Add lighting
+        if random.random() < 0.4:
+            prompt_parts.append(random.choice(self.lighting))
+
+        return ", ".join(prompt_parts)
+
+
+def generate_gemini_image(prompt, keywords=None, style=None, focus=None, enhance_prompt=True, max_retries=3, initial_retry_delay=2):
+    """
+    Generate images using Gemini
+    Depending on the prompt and context, Gemini will generate content in different modes (text to image, text to image and text, etc.).
+    Here are some examples:
+
+    1). Text to image
+    Example prompt: "Generate an image of the Eiffel tower with fireworks in the background."
+    2). Text to image(s) and text (interleaved)
+    Example prompt: "Generate an illustrated recipe for a paella."
+
+    Image generation may not always trigger:
+    - The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image").
+    - The model may stop generating partway through. Try again or try a different prompt.
+
+    The API key is read from the GEMINI_API_KEY environment variable. Each
+    returned image is written to the current working directory and also
+    handed to save_generated_image as a base64 "artifacts" payload.
+
+    Args:
+        prompt (str): The prompt to generate the image from.
+        keywords (list, optional): Keywords to enhance the prompt. Defaults to None.
+        style (str, optional): The style of the image. Only "photorealistic" (together with ``focus``) changes the enhancement. Defaults to None.
+        focus (str, optional): The focus of the image (e.g., "portraits", "objects", "motion", "wide-angle"). Defaults to None.
+        enhance_prompt (bool, optional): Whether to enhance the prompt using AIPromptGenerator. Defaults to True.
+        max_retries (int, optional): Maximum number of retry attempts for handling 503 errors. Defaults to 3.
+        initial_retry_delay (int, optional): Initial delay in seconds before retrying; doubled after every retry. Defaults to 2.
+
+    Returns:
+        str | None: The file name of the last generated image, or None when the
+        response contained no image or the request failed (errors are reported
+        through Streamlit and the logger instead of being raised).
+    """
+    logger.info(f"Generating image with prompt: '{prompt[:100]}...'")
+
+    # Enhance the prompt if requested
+    if enhance_prompt and keywords:
+        prompt_generator = AIPromptGenerator()
+        if style == "photorealistic" and focus:
+            logger.info(f"Generating photorealistic prompt with focus: {focus}")
+            enhanced_prompt = prompt_generator.generate_photorealistic_prompt(keywords, focus)
+        else:
+            logger.info("Generating enhanced prompt")
+            enhanced_prompt = prompt_generator.generate_prompt(keywords)
+
+        # Combine the enhanced prompt with the original prompt
+        prompt = f"{prompt}\n\nEnhanced prompt: {enhanced_prompt}"
+        logger.info(f"Final prompt: '{prompt[:100]}...'")
+
+    retry_count = 0
+    retry_delay = initial_retry_delay
+
+    while retry_count <= max_retries:
+        try:
+            client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+
+            logger.info("Sending request to Gemini API")
+            response = client.models.generate_content(
+                model="gemini-2.0-flash-exp-image-generation",
+                contents=prompt,
+                config=types.GenerateContentConfig(
+                    response_modalities=['Text', 'Image']
+                )
+            )
+            logger.info("Received response from Gemini API")
+
+            img_name = None
+            for part in response.candidates[0].content.parts:
+                if part.text is not None:
+                    logger.info(f"Received text response: '{part.text[:100]}...'")
+                    print(part.text)
+                elif part.inline_data is not None:
+                    logger.info("Received image data from Gemini")
+                    # No image.show() here: it is a debugging helper that
+                    # launches an external viewer on the machine running the app.
+                    image = Image.open(BytesIO(part.inline_data.data))
+                    # part.text is always None in this branch, so the name is
+                    # always timestamp based.
+                    img_name = f'gemini-native-image-{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.png'
+                    try:
+                        logger.info(f"Saving image to: {img_name}")
+                        image.save(img_name)
+
+                        # Read the saved file back with a context manager so
+                        # the handle is closed deterministically.
+                        with open(img_name, "rb") as image_file:
+                            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+
+                        # Create a dictionary with the expected format for save_generated_image
+                        img_response = {
+                            "artifacts": [
+                                {
+                                    "base64": encoded_image
+                                }
+                            ]
+                        }
+
+                        # Call save_generated_image with the correct format
+                        save_generated_image(img_response)
+                    except Exception as err:
+                        logger.error(f"Failed to save image: {err}")
+                        st.error(f"Failed to save image: {err}")
+
+            logger.info(f"Image generation completed. Image name: {img_name}")
+            return img_name
+        except Exception as err:
+            error_message = str(err)
+            logger.error(f"Error in generate_gemini_image: {err}")
+
+            # Only a 503 UNAVAILABLE error is worth retrying
+            if "503 UNAVAILABLE" not in error_message:
+                st.error(f"Error generating image: {err}")
+                return None
+
+            # Retries exhausted: leave the loop so the "unavailable" message
+            # below is actually shown.
+            if retry_count >= max_retries:
+                break
+
+            retry_count += 1
+            logger.info(f"Model is overloaded. Retrying in {retry_delay} seconds (attempt {retry_count}/{max_retries})")
+            st.warning(f"The image generation service is currently busy. Retrying in {retry_delay} seconds...")
+            time.sleep(retry_delay)
+            # Exponential backoff
+            retry_delay *= 2
+
+    # If we've exhausted all retries
+    st.error("The image generation service is currently unavailable. Please try again later.")
+    return None
+
+
+def edit_image(image_path, prompt, max_retries=3, initial_retry_delay=2):
+    """
+    - Image editing (text and image to image)
+    Example prompt: "Edit this image to make it look like a cartoon"
+    Example prompt: [image of a cat] + [image of a pillow] + "Create a cross stitch of my cat on this pillow."
+
+    - Multi-turn image editing (chat)
+    Example prompts: [upload an image of a blue car.] "Turn this car into a convertible." "Now change the color to yellow."
+
+    Image editing with Gemini
+    To perform image editing, add an image as input.
+    The image at ``image_path`` is opened with PIL and sent together with the prompt.
+    For multiple images and larger payloads, check the image input section.
+
+    The edited image is written to the current working directory as
+    "edited-<original file name>" and also handed to save_generated_image as a
+    base64 "artifacts" payload.
+
+    Args:
+        image_path (str): The path to the image to edit.
+        prompt (str): The prompt to edit the image with.
+        max_retries (int, optional): Maximum number of retry attempts for handling 503 errors. Defaults to 3.
+        initial_retry_delay (int, optional): Initial delay in seconds before retrying; doubled after every retry. Defaults to 2.
+
+    Returns:
+        str | None: The file name of the edited image, or None when the
+        response contained no image or the request failed (API errors are
+        reported through Streamlit and the logger instead of being raised).
+
+    Raises:
+        OSError: If ``image_path`` cannot be opened as an image (the open
+        happens before the error-handling retry loop).
+    """
+    retry_count = 0
+    retry_delay = initial_retry_delay
+
+    # The context manager closes the source file once all attempts are done.
+    with Image.open(image_path) as image:
+        while retry_count <= max_retries:
+            try:
+                # Same key source as generate_gemini_image. When the variable
+                # is unset this passes None and the client falls back to its
+                # own environment lookup.
+                client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+
+                logger.info("Sending request to Gemini API for image editing")
+                response = client.models.generate_content(
+                    model="gemini-2.0-flash-exp-image-generation",
+                    contents=[prompt, image],
+                    config=types.GenerateContentConfig(
+                        response_modalities=['Text', 'Image']
+                    )
+                )
+                logger.info("Received response from Gemini API for image editing")
+
+                edited_img_name = None
+                for part in response.candidates[0].content.parts:
+                    if part.text is not None:
+                        logger.info(f"Received text response: '{part.text[:100]}...'")
+                        st.write(part.text)
+                    elif part.inline_data is not None:
+                        logger.info("Received edited image data from Gemini")
+                        # No edited_image.show() here: it is a debugging helper
+                        # that launches an external viewer on the host machine.
+                        edited_image = Image.open(BytesIO(part.inline_data.data))
+
+                        # Save the edited image
+                        edited_img_name = f'edited-{os.path.basename(image_path)}'
+                        try:
+                            logger.info(f"Saving edited image to: {edited_img_name}")
+                            edited_image.save(edited_img_name)
+
+                            # Read the saved file back with a context manager
+                            # so the handle is closed deterministically.
+                            with open(edited_img_name, "rb") as image_file:
+                                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+
+                            # Create a dictionary with the expected format for save_generated_image
+                            img_response = {
+                                "artifacts": [
+                                    {
+                                        "base64": encoded_image
+                                    }
+                                ]
+                            }
+
+                            # Call save_generated_image with the correct format
+                            save_generated_image(img_response)
+                        except Exception as err:
+                            logger.error(f"Failed to save edited image: {err}")
+                            st.error(f"Failed to save edited image: {err}")
+
+                logger.info(f"Image editing completed. Edited image name: {edited_img_name}")
+                return edited_img_name
+            except Exception as err:
+                error_message = str(err)
+                logger.error(f"Error in edit_image: {err}")
+
+                # Only a 503 UNAVAILABLE error is worth retrying
+                if "503 UNAVAILABLE" not in error_message:
+                    st.error(f"Error editing image: {err}")
+                    return None
+
+                # Retries exhausted: leave the loop so the "unavailable"
+                # message below is actually shown.
+                if retry_count >= max_retries:
+                    break
+
+                retry_count += 1
+                logger.info(f"Model is overloaded. Retrying in {retry_delay} seconds (attempt {retry_count}/{max_retries})")
+                st.warning(f"The image editing service is currently busy. Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                # Exponential backoff
+                retry_delay *= 2
+
+    # If we've exhausted all retries
+    st.error("The image editing service is currently unavailable. Please try again later.")
+    return None
+
+
--- a/lib/gpt_providers/text_to_image_generation/main_generate_image_from_prompt.py
+++ b/lib/gpt_providers/text_to_image_generation/main_generate_image_from_prompt.py
@@ -25,7 +25,7 @@ logger.add(sys.stdout,
 from .gen_dali3_images import generate_dalle3_images
 from .gen_stabl_diff_img import generate_stable_diffusion_image
 from ..text_generation.main_text_generation import llm_text_gen
-
+from .gen_gemini_images import generate_gemini_image

 def generate_image(user_prompt):
    """
@@ -44,7 +44,7 @@ def generate_image(user_prompt):
    --> user (str): A unique identifier representing your end-user, which will help OpenAI to monitor and detect abuse.
    """
    # FIXME: Need to remove default value to match sidebar input.
-    image_engine = 'Stability-AI'
+    image_engine = 'Gemini-AI'
    image_stored_at = None

    if user_prompt:
@@ -57,6 +57,9 @@ def generate_image(user_prompt):
                logger.info(f"Calling Stable diffusion text-to-image with prompt: \n{img_prompt}")
                print("\n\n")
                image_stored_at = generate_stable_diffusion_image(img_prompt)
+            elif 'Gemini-AI' in image_engine:
+                logger.info(f"Calling Gemini text-to-image with prompt: \n{img_prompt}")
+                image_stored_at = generate_gemini_image(img_prompt)
            return image_stored_at
        except Exception as err:
            logger.error(f"Failed to generate Image: {err}")