ALwrity AI Blog Writer - Added Google Grounding UI Implementation

2025-09-18 18:45:53 +05:30
parent 9f13daf443
commit 4d153b292d
72 changed files with 11944 additions and 1526 deletions
--- a/backend/services/llm_providers/gemini_grounded_provider.py
+++ b/backend/services/llm_providers/gemini_grounded_provider.py
@@ -89,12 +89,13 @@ class GeminiGroundedProvider:
                    logger.warning(f"URL Context tool not available in SDK version: {tool_err}")
            
            # Apply mode presets (Draft vs Polished)
-            model_id = "gemini-2.5-flash"
+            # Use Gemini 2.0 Flash for better content generation with grounding
+            model_id = "gemini-2.0-flash"
            if mode == "draft":
-                model_id = "gemini-2.5-flash-lite"
+                model_id = "gemini-2.0-flash"
                temperature = min(1.0, max(0.0, temperature))
            else:
-                model_id = "gemini-2.5-flash"
+                model_id = "gemini-2.0-flash"

            # Configure generation settings
            config = types.GenerateContentConfig(
@@ -189,7 +190,7 @@ class GeminiGroundedProvider:
                loop.run_in_executor(
                    executor,
                    lambda: self.client.models.generate_content(
-                        model="gemini-2.5-flash",
+                        model="gemini-2.0-flash",
                        contents=grounded_prompt,
                        config=config,
                    )
@@ -199,6 +200,10 @@ class GeminiGroundedProvider:

    async def _make_api_request_with_model(self, grounded_prompt: str, config: Any, model_id: str, urls: Optional[List[str]] = None):
        """Make the API request with explicit model id and optional URL injection."""
+        logger.info(f"🔍 DEBUG: Making API request with model: {model_id}")
+        logger.info(f"🔍 DEBUG: Prompt length: {len(grounded_prompt)} characters")
+        logger.info(f"🔍 DEBUG: Prompt preview (first 300 chars): {grounded_prompt[:300]}...")
+        
        import concurrent.futures
        loop = asyncio.get_event_loop()
        with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -310,23 +315,70 @@ class GeminiGroundedProvider:
            Processed content with sources and citations
        """
        try:
-            # Extract the main content
+            # Debug: Log response structure
+            logger.info(f"🔍 DEBUG: Response type: {type(response)}")
+            logger.info(f"🔍 DEBUG: Response has 'text': {hasattr(response, 'text')}")
+            logger.info(f"🔍 DEBUG: Response has 'candidates': {hasattr(response, 'candidates')}")
+            logger.info(f"🔍 DEBUG: Response has 'grounding_metadata': {hasattr(response, 'grounding_metadata')}")
+            if hasattr(response, 'grounding_metadata'):
+                logger.info(f"🔍 DEBUG: Grounding metadata: {response.grounding_metadata}")
+            if hasattr(response, 'candidates') and response.candidates:
+                logger.info(f"🔍 DEBUG: Number of candidates: {len(response.candidates)}")
+                candidate = response.candidates[0]
+                logger.info(f"🔍 DEBUG: Candidate type: {type(candidate)}")
+                logger.info(f"🔍 DEBUG: Candidate has 'content': {hasattr(candidate, 'content')}")
+                if hasattr(candidate, 'content') and candidate.content:
+                    logger.info(f"🔍 DEBUG: Content type: {type(candidate.content)}")
+                    # Check if content is a list or single object
+                    if hasattr(candidate.content, '__iter__') and not isinstance(candidate.content, str):
+                        try:
+                            content_length = len(candidate.content) if candidate.content else 0
+                            logger.info(f"🔍 DEBUG: Content is iterable, length: {content_length}")
+                        except TypeError:
+                            logger.info(f"🔍 DEBUG: Content is iterable but has no len() - treating as single object")
+                        for i, part in enumerate(candidate.content):
+                            logger.info(f"🔍 DEBUG: Part {i} type: {type(part)}")
+                            logger.info(f"🔍 DEBUG: Part {i} has 'text': {hasattr(part, 'text')}")
+                            if hasattr(part, 'text'):
+                                logger.info(f"🔍 DEBUG: Part {i} text length: {len(part.text) if part.text else 0}")
+                    else:
+                        logger.info(f"🔍 DEBUG: Content is single object, has 'text': {hasattr(candidate.content, 'text')}")
+                        if hasattr(candidate.content, 'text'):
+                            logger.info(f"🔍 DEBUG: Content text length: {len(candidate.content.text) if candidate.content.text else 0}")
+            
+            # Extract the main content - prioritize response.text as it's more reliable
            content = ""
            if hasattr(response, 'text'):
-                content = response.text
+                logger.info(f"🔍 DEBUG: response.text exists, value: '{response.text}', type: {type(response.text)}")
+                if response.text:
+                    content = response.text
+                    logger.info(f"🔍 DEBUG: Using response.text, length: {len(content)}")
+                else:
+                    logger.info(f"🔍 DEBUG: response.text is empty or None")
            elif hasattr(response, 'candidates') and response.candidates:
                candidate = response.candidates[0]
                if hasattr(candidate, 'content') and candidate.content:
-                    # Extract text from content parts
-                    text_parts = []
-                    for part in candidate.content:
-                        if hasattr(part, 'text'):
-                            text_parts.append(part.text)
-                    content = " ".join(text_parts)
+                    # Handle both single Content object and list of parts
+                    if hasattr(candidate.content, '__iter__') and not isinstance(candidate.content, str):
+                        # Content is a list of parts
+                        text_parts = []
+                        for part in candidate.content:
+                            if hasattr(part, 'text'):
+                                text_parts.append(part.text)
+                        content = " ".join(text_parts)
+                        logger.info(f"🔍 DEBUG: Using candidate.content (list), extracted {len(text_parts)} parts, total length: {len(content)}")
+                    else:
+                        # Content is a single object
+                        if hasattr(candidate.content, 'text'):
+                            content = candidate.content.text
+                            logger.info(f"🔍 DEBUG: Using candidate.content (single), text length: {len(content)}")
+                        else:
+                            logger.warning("🔍 DEBUG: candidate.content has no 'text' attribute")
            
            logger.info(f"Extracted content length: {len(content) if content else 0}")
            if not content:
-                logger.warning("No content extracted from response")
+                logger.warning("⚠️ No content extracted from Gemini response - using fallback content")
+                logger.warning("⚠️ This indicates Google Search grounding is not working properly")
                content = "Generated content about the requested topic."
            
            # Initialize result structure
--- a/backend/services/llm_providers/gemini_provider.py
+++ b/backend/services/llm_providers/gemini_provider.py
@@ -440,7 +440,8 @@ def gemini_structured_json_response(prompt, schema, temperature=0.7, top_p=0.9,
        return {"error": str(e)}


-def _repair_json_string(text: str) -> Optional[str]:
+# JSON repair retired to avoid false positives; the function is renamed with a `_removed_` prefix rather than deleted.
+def _removed_repair_json_string(text: str) -> Optional[str]:
    """
    Attempt to repair common JSON issues in AI responses.
    """
@@ -489,13 +490,21 @@ def _repair_json_string(text: str) -> Optional[str]:
        fixed_lines.append(line)
    repaired = '\n'.join(fixed_lines)
    
-    # 3. Fix unescaped quotes in string values
-    # This is complex - we'll use a simple approach
+    # 3. Fix unterminated strings (common issue with AI responses)
    try:
-        # Try to balance quotes by adding missing ones
+        # Handle unterminated strings by finding the last incomplete string and closing it
        lines = repaired.split('\n')
        fixed_lines = []
-        for line in lines:
+        for i, line in enumerate(lines):
+            stripped = line.strip()
+            # Candidate for repair: the stripped line ends with a quote and is not the last line
+            if stripped.endswith('"') and i < len(lines) - 1:
+                next_line = lines[i + 1].strip()
+                # If next line doesn't start with quote or closing bracket, we might have an unterminated string
+                if not next_line.startswith('"') and not next_line.startswith(']') and not next_line.startswith('}'):
+                    # Check if this looks like an unterminated string value
+                    if ':' in line and not line.strip().endswith('",'):
+                        line = line + '",'
            # Count quotes in the line
            quote_count = line.count('"')
            if quote_count % 2 == 1:  # Odd number of quotes
@@ -518,7 +527,8 @@ def _repair_json_string(text: str) -> Optional[str]:
    return repaired


-def _extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
+# Partial JSON extraction retired to avoid false positives; the function is renamed with a `_removed_` prefix rather than deleted.
+def _removed_extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
    """
    Extract partial JSON from truncated responses.
    Attempts to salvage as much data as possible from incomplete JSON.
@@ -572,26 +582,77 @@ def _extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
            # Try to extract individual fields as a last resort
            fields = {}
            
-            # Extract key-value pairs using regex
-            kv_pattern = r'"([^"]+)"\s*:\s*"([^"]*)"'
-            matches = re.findall(kv_pattern, json_text)
-            for key, value in matches:
-                fields[key] = value
+            # Extract key-value pairs using regex (more comprehensive patterns)
+            kv_patterns = [
+                r'"([^"]+)"\s*:\s*"([^"]*)"',  # "key": "value"
+                r'"([^"]+)"\s*:\s*(\d+)',      # "key": 123
+                r'"([^"]+)"\s*:\s*(true|false)', # "key": true/false
+                r'"([^"]+)"\s*:\s*null',       # "key": null
+            ]
            
-            # Extract array fields
+            for pattern in kv_patterns:
+                matches = re.findall(pattern, json_text)
+                for key, value in matches:
+                    if value == 'true':
+                        fields[key] = True
+                    elif value == 'false':
+                        fields[key] = False
+                    elif value == 'null':
+                        fields[key] = None
+                    elif value.isdigit():
+                        fields[key] = int(value)
+                    else:
+                        fields[key] = value
+            
+            # Extract array fields (more robust)
            array_pattern = r'"([^"]+)"\s*:\s*\[([^\]]*)\]'
            array_matches = re.findall(array_pattern, json_text)
            for key, array_content in array_matches:
-                # Parse array items
+                # Parse array items more comprehensively
                items = []
-                item_pattern = r'"([^"]*)"'
-                item_matches = re.findall(item_pattern, array_content)
-                items.extend(item_matches)
-                fields[key] = items
+                # Look for quoted strings, numbers, booleans, null
+                item_patterns = [
+                    r'"([^"]*)"',  # quoted strings
+                    r'(\d+)',      # numbers
+                    r'(true|false)', # booleans
+                    r'(null)',     # null
+                ]
+                for pattern in item_patterns:
+                    item_matches = re.findall(pattern, array_content)
+                    for match in item_matches:
+                        if match == 'true':
+                            items.append(True)
+                        elif match == 'false':
+                            items.append(False)
+                        elif match == 'null':
+                            items.append(None)
+                        elif match.isdigit():
+                            items.append(int(match))
+                        else:
+                            items.append(match)
+                if items:
+                    fields[key] = items
+            
+            # Extract nested object fields (basic)
+            object_pattern = r'"([^"]+)"\s*:\s*\{([^}]*)\}'
+            object_matches = re.findall(object_pattern, json_text)
+            for key, object_content in object_matches:
+                # Simple nested object extraction
+                nested_fields = {}
+                nested_kv_matches = re.findall(r'"([^"]+)"\s*:\s*"([^"]*)"', object_content)
+                for nested_key, nested_value in nested_kv_matches:
+                    nested_fields[nested_key] = nested_value
+                if nested_fields:
+                    fields[key] = nested_fields
            
            if fields:
-                logger.info(f"Extracted {len(fields)} fields from truncated JSON")
-                return fields
+                logger.info(f"Extracted {len(fields)} fields from truncated JSON: {list(fields.keys())}")
+                # Only return if we have a valid outline structure
+                if 'outline' in fields and isinstance(fields['outline'], list):
+                    return {'outline': fields['outline']}
+                else:
+                    logger.error("No valid 'outline' field found in partial JSON")
+                    return None
            
            return None
            
@@ -600,7 +661,8 @@ def _extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
        return None


-def _extract_key_value_pairs(text: str) -> Optional[Dict[str, Any]]:
+# Key-value extraction retired to avoid false positives; the function is renamed with a `_removed_` prefix rather than deleted.
+def _removed_extract_key_value_pairs(text: str) -> Optional[Dict[str, Any]]:
    """
    Extract key-value pairs from malformed JSON text as a last resort.
    """