Google trends data and keyword research

This commit is contained in:
ajaysi
2025-04-05 22:50:43 +05:30
parent a2fb77f700
commit 579bf7d0a6
5 changed files with 1122 additions and 379 deletions

View File

@@ -27,15 +27,19 @@ from pathlib import Path
import sys
from datetime import datetime
import streamlit as st
import pandas as pd
import random
import numpy as np
from lib.alwrity_ui.display_google_serp_results import (
process_research_results,
process_search_results,
display_research_results
)
from lib.alwrity_ui.google_trends_ui import display_google_trends_data, process_trends_data
from .tavily_ai_search import get_tavilyai_results
from .metaphor_basic_neural_web_search import metaphor_search_articles
from .metaphor_basic_neural_web_search import metaphor_search_articles, streamlit_display_metaphor_results
from .google_serp_search import google_search
from .google_trends_researcher import do_google_trends_analysis
#from .google_gemini_web_researcher import do_gemini_web_research
@@ -56,6 +60,10 @@ def gpt_web_researcher(search_keywords, search_mode, **kwargs):
logger.debug(f"Additional parameters: {kwargs}")
try:
# Reset session state variables for this research operation
if 'metaphor_results_displayed' in st.session_state:
del st.session_state.metaphor_results_displayed
# Initialize result container
research_results = None
@@ -157,13 +165,76 @@ def gpt_web_researcher(search_keywords, search_mode, **kwargs):
update_progress("Metaphor AI search failed, continuing with Tavily results only...", level="warning")
else:
update_progress("Metaphor AI search completed successfully", progress=75)
# Add debug logging to check the structure of metaphor_results
logger.debug(f"Metaphor results structure: {type(metaphor_results)}")
if isinstance(metaphor_results, dict):
logger.debug(f"Metaphor results keys: {metaphor_results.keys()}")
if 'data' in metaphor_results:
logger.debug(f"Metaphor data keys: {metaphor_results['data'].keys()}")
if 'results' in metaphor_results['data']:
logger.debug(f"Number of results: {len(metaphor_results['data']['results'])}")
# Display Metaphor results only if not already displayed
if 'metaphor_results_displayed' not in st.session_state:
st.session_state.metaphor_results_displayed = True
# Make sure to pass the correct parameters to streamlit_display_metaphor_results
streamlit_display_metaphor_results(metaphor_results, search_keywords)
# Add Google Trends Analysis
update_progress("Initiating Google Trends analysis...", progress=80)
try:
# Add an informative message about Google Trends
with st.expander(" About Google Trends Analysis", expanded=False):
st.markdown("""
**What is Google Trends Analysis?**
Google Trends Analysis provides insights into how often a particular search-term is entered relative to the total search-volume across various regions of the world, and in various languages.
**What data will be shown?**
- **Related Keywords**: Terms that are frequently searched together with your keyword
- **Interest Over Time**: How interest in your keyword has changed over the past 12 months
- **Regional Interest**: Where in the world your keyword is most popular
- **Related Queries**: What people search for before and after searching for your keyword
- **Related Topics**: Topics that are closely related to your keyword
**How to use this data:**
- Identify trending topics in your industry
- Understand seasonal patterns in search behavior
- Discover related keywords for content planning
- Target content to specific regions with high interest
""")
trends_results = do_google_pytrends_analysis(search_keywords)
if trends_results:
update_progress("Google Trends analysis completed successfully", progress=90)
# Store trends results in the research_results
if metaphor_results:
metaphor_results['trends_data'] = trends_results
else:
# If metaphor_results is None, create a new container for results
metaphor_results = {'trends_data': trends_results}
# Display Google Trends data using the new UI module
display_google_trends_data(trends_results, search_keywords)
else:
update_progress("Google Trends analysis returned no results", level="warning")
except Exception as trends_err:
logger.error(f"Google Trends analysis failed: {trends_err}")
update_progress("Google Trends analysis failed", level="warning")
st.error(f"Error in Google Trends analysis: {str(trends_err)}")
# Return the combined results
update_progress("Research completed!", progress=100, level="success")
return metaphor_results or t_results
except Exception as ai_err:
error_msg = f"AI research pipeline failed: {str(ai_err)}"
logger.error(error_msg, exc_info=True)
update_progress(error_msg, level="error")
raise
else:
error_msg = f"Unsupported search mode: {search_mode}"
logger.error(error_msg)
@@ -316,13 +387,355 @@ def do_metaphor_ai_research(search_keywords):
return None, None
def do_google_pytrends_analysis(search_keywords):
""" """
def do_google_pytrends_analysis(keywords):
"""
Perform Google Trends analysis for the given keywords.
Args:
keywords (str): The search keywords to analyze
Returns:
dict: A dictionary containing formatted Google Trends data with the following keys:
- related_keywords: List of related keywords
- interest_over_time: DataFrame with date and interest columns
- regional_interest: DataFrame with country_code, country, and interest columns
- related_queries: DataFrame with query and value columns
- related_topics: DataFrame with topic and value columns
"""
logger.info(f"Performing Google Trends analysis for keywords: {keywords}")
# Create a progress container for Streamlit
progress_container = st.empty()
progress_bar = st.progress(0)
def update_progress(message, progress=None, level="info"):
"""Helper function to update progress in Streamlit UI"""
if progress is not None:
progress_bar.progress(progress)
if level == "error":
progress_container.error(f"🚫 {message}")
elif level == "warning":
progress_container.warning(f"⚠️ {message}")
else:
progress_container.info(f"🔄 {message}")
logger.debug(f"Progress update [{level}]: {message}")
try:
logger.info(f"Do Google Trends analysis for given keywords: {search_keywords}")
return(do_google_trends_analysis(search_keywords))
except Exception as err:
logger.error(f"Failed to do google trends analysis: {err}")
# Initialize the formatted data dictionary
formatted_data = {
'related_keywords': [],
'interest_over_time': pd.DataFrame(),
'regional_interest': pd.DataFrame(),
'related_queries': pd.DataFrame(),
'related_topics': pd.DataFrame()
}
# Get raw trends data from google_trends_researcher
update_progress("Fetching Google Trends data...", progress=10)
raw_trends_data = do_google_trends_analysis(keywords)
if not raw_trends_data:
logger.warning("No Google Trends data returned")
update_progress("No Google Trends data returned", level="warning", progress=20)
return formatted_data
# Process related keywords from the raw data
update_progress("Processing related keywords...", progress=30)
if isinstance(raw_trends_data, list):
formatted_data['related_keywords'] = raw_trends_data
elif isinstance(raw_trends_data, dict):
if 'keywords' in raw_trends_data:
formatted_data['related_keywords'] = raw_trends_data['keywords']
if 'interest_over_time' in raw_trends_data:
formatted_data['interest_over_time'] = raw_trends_data['interest_over_time']
if 'regional_interest' in raw_trends_data:
formatted_data['regional_interest'] = raw_trends_data['regional_interest']
if 'related_queries' in raw_trends_data:
formatted_data['related_queries'] = raw_trends_data['related_queries']
if 'related_topics' in raw_trends_data:
formatted_data['related_topics'] = raw_trends_data['related_topics']
# If we have keywords but missing other data, try to fetch them using pytrends directly
if formatted_data['related_keywords'] and (
formatted_data['interest_over_time'].empty or
formatted_data['regional_interest'].empty or
formatted_data['related_queries'].empty or
formatted_data['related_topics'].empty
):
try:
update_progress("Fetching additional data from Google Trends API...", progress=40)
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)
# Build payload with the main keyword
update_progress("Building search payload...", progress=45)
pytrends.build_payload([keywords], timeframe='today 12-m', geo='')
# Get interest over time if missing
if formatted_data['interest_over_time'].empty:
try:
update_progress("Fetching interest over time data...", progress=50)
interest_df = pytrends.interest_over_time()
if not interest_df.empty:
formatted_data['interest_over_time'] = interest_df.reset_index()
update_progress(f"Successfully fetched interest over time data with {len(formatted_data['interest_over_time'])} data points", progress=55)
else:
update_progress("No interest over time data available", level="warning", progress=55)
except Exception as e:
logger.error(f"Error fetching interest over time: {e}")
update_progress(f"Error fetching interest over time: {str(e)}", level="warning", progress=55)
# Get regional interest if missing
if formatted_data['regional_interest'].empty:
try:
update_progress("Fetching regional interest data...", progress=60)
regional_df = pytrends.interest_by_region()
if not regional_df.empty:
formatted_data['regional_interest'] = regional_df.reset_index()
update_progress(f"Successfully fetched regional interest data for {len(formatted_data['regional_interest'])} regions", progress=65)
else:
update_progress("No regional interest data available", level="warning", progress=65)
except Exception as e:
logger.error(f"Error fetching regional interest: {e}")
update_progress(f"Error fetching regional interest: {str(e)}", level="warning", progress=65)
# Get related queries if missing
if formatted_data['related_queries'].empty:
try:
update_progress("Fetching related queries data...", progress=70)
# Get related queries data
related_queries = pytrends.related_queries()
# Create empty DataFrame as fallback
formatted_data['related_queries'] = pd.DataFrame(columns=['query', 'value'])
# Simple direct approach to avoid list index errors
if related_queries and isinstance(related_queries, dict):
# Check if our keyword exists in the results
if keywords in related_queries:
keyword_data = related_queries[keywords]
# Process top queries if available
if 'top' in keyword_data and keyword_data['top'] is not None:
try:
update_progress("Processing top related queries...", progress=75)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['top'], pd.DataFrame):
top_df = keyword_data['top']
else:
# Try to convert to DataFrame
top_df = pd.DataFrame(keyword_data['top'])
# Ensure it has the right columns
if not top_df.empty:
# Rename columns if needed
if 'query' in top_df.columns:
# Already has the right column name
pass
elif len(top_df.columns) > 0:
# Use first column as query
top_df = top_df.rename(columns={top_df.columns[0]: 'query'})
# Add to our results
formatted_data['related_queries'] = top_df
update_progress(f"Successfully processed {len(top_df)} top related queries", progress=80)
except Exception as e:
logger.warning(f"Error processing top queries: {e}")
update_progress(f"Error processing top queries: {str(e)}", level="warning", progress=80)
# Process rising queries if available
if 'rising' in keyword_data and keyword_data['rising'] is not None:
try:
update_progress("Processing rising related queries...", progress=85)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['rising'], pd.DataFrame):
rising_df = keyword_data['rising']
else:
# Try to convert to DataFrame
rising_df = pd.DataFrame(keyword_data['rising'])
# Ensure it has the right columns
if not rising_df.empty:
# Rename columns if needed
if 'query' in rising_df.columns:
# Already has the right column name
pass
elif len(rising_df.columns) > 0:
# Use first column as query
rising_df = rising_df.rename(columns={rising_df.columns[0]: 'query'})
# Combine with existing data if we have any
if not formatted_data['related_queries'].empty:
formatted_data['related_queries'] = pd.concat([formatted_data['related_queries'], rising_df])
update_progress(f"Successfully processed {len(rising_df)} rising related queries", progress=90)
else:
formatted_data['related_queries'] = rising_df
update_progress(f"Successfully processed {len(rising_df)} rising related queries", progress=90)
except Exception as e:
logger.warning(f"Error processing rising queries: {e}")
update_progress(f"Error processing rising queries: {str(e)}", level="warning", progress=90)
except Exception as e:
logger.error(f"Error fetching related queries: {e}")
update_progress(f"Error fetching related queries: {str(e)}", level="warning", progress=90)
# Ensure we have an empty DataFrame with the right columns
formatted_data['related_queries'] = pd.DataFrame(columns=['query', 'value'])
# Get related topics if missing
if formatted_data['related_topics'].empty:
try:
update_progress("Fetching related topics data...", progress=95)
# Get related topics data
related_topics = pytrends.related_topics()
# Create empty DataFrame as fallback
formatted_data['related_topics'] = pd.DataFrame(columns=['topic', 'value'])
# Simple direct approach to avoid list index errors
if related_topics and isinstance(related_topics, dict):
# Check if our keyword exists in the results
if keywords in related_topics:
keyword_data = related_topics[keywords]
# Process top topics if available
if 'top' in keyword_data and keyword_data['top'] is not None:
try:
update_progress("Processing top related topics...", progress=97)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['top'], pd.DataFrame):
top_df = keyword_data['top']
else:
# Try to convert to DataFrame
top_df = pd.DataFrame(keyword_data['top'])
# Ensure it has the right columns
if not top_df.empty:
# Rename columns if needed
if 'topic_title' in top_df.columns:
top_df = top_df.rename(columns={'topic_title': 'topic'})
elif len(top_df.columns) > 0 and 'topic' not in top_df.columns:
# Use first column as topic
top_df = top_df.rename(columns={top_df.columns[0]: 'topic'})
# Add to our results
formatted_data['related_topics'] = top_df
update_progress(f"Successfully processed {len(top_df)} top related topics", progress=98)
except Exception as e:
logger.warning(f"Error processing top topics: {e}")
update_progress(f"Error processing top topics: {str(e)}", level="warning", progress=98)
# Process rising topics if available
if 'rising' in keyword_data and keyword_data['rising'] is not None:
try:
update_progress("Processing rising related topics...", progress=99)
# Convert to DataFrame if it's not already
if isinstance(keyword_data['rising'], pd.DataFrame):
rising_df = keyword_data['rising']
else:
# Try to convert to DataFrame
rising_df = pd.DataFrame(keyword_data['rising'])
# Ensure it has the right columns
if not rising_df.empty:
# Rename columns if needed
if 'topic_title' in rising_df.columns:
rising_df = rising_df.rename(columns={'topic_title': 'topic'})
elif len(rising_df.columns) > 0 and 'topic' not in rising_df.columns:
# Use first column as topic
rising_df = rising_df.rename(columns={rising_df.columns[0]: 'topic'})
# Combine with existing data if we have any
if not formatted_data['related_topics'].empty:
formatted_data['related_topics'] = pd.concat([formatted_data['related_topics'], rising_df])
update_progress(f"Successfully processed {len(rising_df)} rising related topics", progress=100)
else:
formatted_data['related_topics'] = rising_df
update_progress(f"Successfully processed {len(rising_df)} rising related topics", progress=100)
except Exception as e:
logger.warning(f"Error processing rising topics: {e}")
update_progress(f"Error processing rising topics: {str(e)}", level="warning", progress=100)
except Exception as e:
logger.error(f"Error fetching related topics: {e}")
update_progress(f"Error fetching related topics: {str(e)}", level="warning", progress=100)
# Ensure we have an empty DataFrame with the right columns
formatted_data['related_topics'] = pd.DataFrame(columns=['topic', 'value'])
except Exception as e:
logger.error(f"Error fetching additional trends data: {e}")
update_progress(f"Error fetching additional trends data: {str(e)}", level="warning", progress=100)
# Ensure all DataFrames have the correct column names for the UI
update_progress("Finalizing data formatting...", progress=100)
if not formatted_data['interest_over_time'].empty:
if 'date' not in formatted_data['interest_over_time'].columns:
formatted_data['interest_over_time'] = formatted_data['interest_over_time'].reset_index()
if 'interest' not in formatted_data['interest_over_time'].columns and keywords in formatted_data['interest_over_time'].columns:
formatted_data['interest_over_time'] = formatted_data['interest_over_time'].rename(columns={keywords: 'interest'})
if not formatted_data['regional_interest'].empty:
if 'country_code' not in formatted_data['regional_interest'].columns and 'geoName' in formatted_data['regional_interest'].columns:
formatted_data['regional_interest'] = formatted_data['regional_interest'].rename(columns={'geoName': 'country_code'})
if 'interest' not in formatted_data['regional_interest'].columns and keywords in formatted_data['regional_interest'].columns:
formatted_data['regional_interest'] = formatted_data['regional_interest'].rename(columns={keywords: 'interest'})
if not formatted_data['related_queries'].empty:
# Handle different column names that might be present in the related queries DataFrame
if 'query' not in formatted_data['related_queries'].columns:
if 'Top query' in formatted_data['related_queries'].columns:
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={'Top query': 'query'})
elif 'Rising query' in formatted_data['related_queries'].columns:
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={'Rising query': 'query'})
elif 'query' not in formatted_data['related_queries'].columns and len(formatted_data['related_queries'].columns) > 0:
# If we have a DataFrame but no 'query' column, use the first column as 'query'
first_col = formatted_data['related_queries'].columns[0]
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={first_col: 'query'})
if 'value' not in formatted_data['related_queries'].columns and len(formatted_data['related_queries'].columns) > 1:
# If we have a second column, use it as 'value'
second_col = formatted_data['related_queries'].columns[1]
formatted_data['related_queries'] = formatted_data['related_queries'].rename(columns={second_col: 'value'})
elif 'value' not in formatted_data['related_queries'].columns:
# If no 'value' column exists, add one with default values
formatted_data['related_queries']['value'] = 0
if not formatted_data['related_topics'].empty:
# Handle different column names that might be present in the related topics DataFrame
if 'topic' not in formatted_data['related_topics'].columns:
if 'topic_title' in formatted_data['related_topics'].columns:
formatted_data['related_topics'] = formatted_data['related_topics'].rename(columns={'topic_title': 'topic'})
elif 'topic' not in formatted_data['related_topics'].columns and len(formatted_data['related_topics'].columns) > 0:
# If we have a DataFrame but no 'topic' column, use the first column as 'topic'
first_col = formatted_data['related_topics'].columns[0]
formatted_data['related_topics'] = formatted_data['related_topics'].rename(columns={first_col: 'topic'})
if 'value' not in formatted_data['related_topics'].columns and len(formatted_data['related_topics'].columns) > 1:
# If we have a second column, use it as 'value'
second_col = formatted_data['related_topics'].columns[1]
formatted_data['related_topics'] = formatted_data['related_topics'].rename(columns={second_col: 'value'})
elif 'value' not in formatted_data['related_topics'].columns:
# If no 'value' column exists, add one with default values
formatted_data['related_topics']['value'] = 0
# Clear the progress container after completion
progress_container.empty()
progress_bar.empty()
return formatted_data
except Exception as e:
logger.error(f"Error in Google Trends analysis: {e}")
update_progress(f"Error in Google Trends analysis: {str(e)}", level="error", progress=100)
# Clear the progress container after error
progress_container.empty()
progress_bar.empty()
return {
'related_keywords': [],
'interest_over_time': pd.DataFrame(),
'regional_interest': pd.DataFrame(),
'related_queries': pd.DataFrame(),
'related_topics': pd.DataFrame()
}
def metaphor_extract_titles_or_text(json_data, return_titles=True):