""" This Python script analyzes Google search keywords by fetching auto-suggestions, performing keyword clustering, and visualizing Google Trends data. It uses various libraries such as pytrends, requests_html, tqdm, and more. Features: - Fetches auto-suggestions for a given search keyword from Google. - Performs keyword clustering using K-means algorithm based on TF-IDF vectors. - Visualizes Google Trends data, including interest over time and interest by region. - Retrieves related queries and topics for a set of search keywords. - Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results. - Incorporates logger.for error handling and informative messages. Usage: - Provide a search term or a list of search terms for analysis. - Run the script to fetch auto-suggestions, perform clustering, and visualize Google Trends data. - Explore the displayed results, including top keywords in each cluster and related topics. Modifications: - Customize the search terms in the 'do_google_trends_analysis' function. - Adjust the number of clusters for keyword clustering and other parameters as needed. - Explore further visualizations and analyses based on the generated data. Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich'. """ import os import time # I wish import random import requests import numpy as np import sys from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.cluster import KMeans import matplotlib.pyplot as plt from sklearn.metrics import silhouette_score, silhouette_samples from rich.console import Console from rich.progress import Progress import urllib import json import pandas as pd import matplotlib.pyplot as plt import plotly.express as px import plotly.io as pio from requests_html import HTML, HTMLSession from urllib.parse import quote_plus from tqdm import tqdm from tabulate import tabulate from pytrends.request import TrendReq from loguru import logger # Configure logger logger.remove() logger.add(sys.stdout, colorize=True, format="{level}|{file}:{line}:{function}| {message}" ) def fetch_google_trends_interest_overtime(keyword): try: pytrends = TrendReq(hl='en-US', tz=360) pytrends.build_payload([keyword], timeframe='today 1-y', geo='US') # 1. Interest Over Time data = pytrends.interest_over_time() data = data.reset_index() # Visualization using Matplotlib plt.figure(figsize=(10, 6)) plt.plot(data['date'], data[keyword], label=keyword) plt.title(f'Interest Over Time for "{keyword}"') plt.xlabel('Date') plt.ylabel('Interest') plt.legend() plt.show() return data except Exception as e: logger.error(f"Error in fetch_google_trends_data: {e}") return pd.DataFrame() def plot_interest_by_region(kw_list): try: from pytrends.request import TrendReq import matplotlib.pyplot as plt trends = TrendReq() trends.build_payload(kw_list=kw_list) kw_list = ' '.join(kw_list) data = trends.interest_by_region() #sorting by region data = data.sort_values(by=f"{kw_list}", ascending=False) print("\nπŸ“’β—πŸš¨ ") print(f"Top 10 regions with highest interest for keyword: {kw_list}") data = data.head(10) #Top 10 print(data) data.reset_index().plot(x="geoName", y=f"{kw_list}", figsize=(20,15), kind="bar") plt.style.use('fivethirtyeight') plt.show() # FIXME: Send this image to vision GPT for analysis. except Exception as e: print(f"Error plotting interest by region: {e}") return None def get_related_topics_and_save_csv(search_keywords): search_keywords = [f"{search_keywords}"] try: pytrends = TrendReq(hl='en-US', tz=360) pytrends.build_payload(kw_list=search_keywords, timeframe='today 12-m') # Get related topics - this returns a dictionary topics_data = pytrends.related_topics() # Extract data for the first keyword if topics_data and search_keywords[0] in topics_data: keyword_data = topics_data[search_keywords[0]] # Create two separate dataframes for top and rising top_df = keyword_data.get('top', pd.DataFrame()) rising_df = keyword_data.get('rising', pd.DataFrame()) return { 'top': top_df[['topic_title', 'value']] if not top_df.empty else pd.DataFrame(), 'rising': rising_df[['topic_title', 'value']] if not rising_df.empty else pd.DataFrame() } except Exception as e: logger.error(f"Error in related topics: {e}") return {'top': pd.DataFrame(), 'rising': pd.DataFrame()} def get_related_queries_and_save_csv(search_keywords): search_keywords = [f"{search_keywords}"] try: pytrends = TrendReq(hl='en-US', tz=360) pytrends.build_payload(kw_list=search_keywords, timeframe='today 12-m') # Get related queries - this returns a dictionary queries_data = pytrends.related_queries() # Extract data for the first keyword if queries_data and search_keywords[0] in queries_data: keyword_data = queries_data[search_keywords[0]] # Create two separate dataframes for top and rising top_df = keyword_data.get('top', pd.DataFrame()) rising_df = keyword_data.get('rising', pd.DataFrame()) return { 'top': top_df if not top_df.empty else pd.DataFrame(), 'rising': rising_df if not rising_df.empty else pd.DataFrame() } except Exception as e: logger.error(f"Error in related queries: {e}") return {'top': pd.DataFrame(), 'rising': pd.DataFrame()} def get_source(url): try: session = HTMLSession() response = session.get(url) response.raise_for_status() # Raise an HTTPError for bad responses return response except requests.exceptions.RequestException as e: logger.error(f"Error during HTTP request: {e}") return None def get_results(query): try: query = urllib.parse.quote_plus(query) response = get_source(f"https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q={query}") time.sleep(random.uniform(0.1, 0.6)) if response: response.raise_for_status() results = json.loads(response.text) return results else: return None except json.JSONDecodeError as e: logger.error(f"Error decoding JSON response: {e}") return None except requests.exceptions.RequestException as e: logger.error(f"Error during HTTP request: {e}") return None def format_results(results): try: suggestions = [] for index, value in enumerate(results[1]): suggestion = {'term': value, 'relevance': results[4]['google:suggestrelevance'][index]} suggestions.append(suggestion) return suggestions except (KeyError, IndexError) as e: logger.error(f"Error parsing search results: {e}") return [] def get_expanded_term_suffixes(): return ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm','n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] def get_expanded_term_prefixes(): # For shopping, review type blogs. #return ['discount *', 'pricing *', 'cheap', 'best price *', 'lowest price', 'best value', 'sale', 'affordable', 'promo', 'budget''what *', 'where *', 'how to *', 'why *', 'buy*', 'how much*','best *', 'worse *', 'rent*', 'sale*', 'offer*','vs*','or*'] return ['what *', 'where *', 'how to *', 'why *','best *', 'vs*', 'or*'] def get_expanded_terms(query): try: expanded_term_prefixes = get_expanded_term_prefixes() expanded_term_suffixes = get_expanded_term_suffixes() terms = [query] for term in expanded_term_prefixes: terms.append(f"{term} {query}") for term in expanded_term_suffixes: terms.append(f"{query} {term}") return terms except Exception as e: logger.error(f"Error in get_expanded_terms: {e}") return [] def get_expanded_suggestions(query): try: all_results = [] expanded_terms = get_expanded_terms(query) for term in tqdm(expanded_terms, desc="πŸ“’β—πŸš¨ Fetching Google AutoSuggestions", unit="term"): results = get_results(term) if results: formatted_results = format_results(results) all_results += formatted_results all_results = sorted(all_results, key=lambda k: k.get('relevance', 0), reverse=True) return all_results except Exception as e: logger.error(f"Error in get_expanded_suggestions: {e}") return [] def get_suggestions_for_keyword(search_term): """ """ try: expanded_results = get_expanded_suggestions(search_term) expanded_results_df = pd.DataFrame(expanded_results) expanded_results_df.columns = ['Keywords', 'Relevance'] #expanded_results_df.to_csv('results.csv', index=False) pd.set_option('display.max_rows', expanded_results_df.shape[0]+1) expanded_results_df.drop_duplicates('Keywords', inplace=True) table = tabulate(expanded_results_df, headers=['Keywords', 'Relevance'], tablefmt='fancy_grid') # FIXME: Too much data for LLM context window. We will need to embed it. #try: # save_in_file(table) #except Exception as save_results_err: # logger.error(f"Failed to save search results: {save_results_err}") return expanded_results_df except Exception as e: logger.error(f"get_suggestions_for_keyword: Error in main: {e}") def perform_keyword_clustering(expanded_results_df, num_clusters=5): try: # Preprocessing: Convert the keywords to lowercase expanded_results_df['Keywords'] = expanded_results_df['Keywords'].str.lower() # Vectorization: Create a TF-IDF vectorizer vectorizer = TfidfVectorizer() # Fit the vectorizer to the keywords tfidf_vectors = vectorizer.fit_transform(expanded_results_df['Keywords']) # Applying K-means clustering kmeans = KMeans(n_clusters=num_clusters, random_state=42) cluster_labels = kmeans.fit_predict(tfidf_vectors) # Add cluster labels to the DataFrame expanded_results_df['cluster_label'] = cluster_labels # Assessing cluster quality through silhouette score silhouette_avg = silhouette_score(tfidf_vectors, cluster_labels) print(f"Silhouette Score: {silhouette_avg}") # Visualize cluster quality using a silhouette plot #visualize_silhouette(tfidf_vectors, cluster_labels) return expanded_results_df except Exception as e: logger.error(f"Error in perform_keyword_clustering: {e}") return pd.DataFrame() def visualize_silhouette(X, labels): try: silhouette_avg = silhouette_score(X, labels) print(f"Silhouette Score: {silhouette_avg}") # Create a subplot with 1 row and 2 columns fig, ax1 = plt.subplots(1, 1, figsize=(8, 6)) # The 1st subplot is the silhouette plot ax1.set_xlim([-0.1, 1]) ax1.set_ylim([0, X.shape[0] + (len(set(labels)) + 1) * 10]) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, labels) y_lower = 10 for i in set(labels): # Aggregate the silhouette scores for samples belonging to the cluster ith_cluster_silhouette_values = sample_silhouette_values[labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = plt.cm.nipy_spectral(float(i) / len(set(labels))) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) # Compute the new y_lower for the next plot y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("Silhouette plot for KMeans clustering") ax1.set_xlabel("Silhouette coefficient values") ax1.set_ylabel("Cluster label") # The vertical line for the average silhouette score of all the values ax1.axvline(x=silhouette_avg, color="red", linestyle="--") plt.show() except Exception as e: logger.error(f"Error in visualize_silhouette: {e}") def print_and_return_top_keywords(expanded_results_df, num_clusters=5): """ Display and return top keywords in each cluster. Args: expanded_results_df (pd.DataFrame): DataFrame containing expanded keywords, relevance, and cluster labels. num_clusters (int or str): Number of clusters or 'all'. Returns: pd.DataFrame: DataFrame with top keywords for each cluster. """ top_keywords_df = pd.DataFrame() if num_clusters == 'all': unique_clusters = expanded_results_df['cluster_label'].unique() else: unique_clusters = range(int(num_clusters)) for i in unique_clusters: cluster_df = expanded_results_df[expanded_results_df['cluster_label'] == i] top_keywords = cluster_df.sort_values(by='Relevance', ascending=False).head(5) top_keywords_df = pd.concat([top_keywords_df, top_keywords]) print(f"\nπŸ“’β—πŸš¨ GTop Keywords for All Clusters:") table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid') # Save the combined table to a file try: save_in_file(table) except Exception as save_results_err: logger.error(f"🚨 Failed to save search results: {save_results_err}") print(table) return top_keywords_df def generate_wordcloud(keywords): """ Generate and display a word cloud from a list of keywords. Args: keywords (list): List of keywords. """ # Convert the list of keywords to a string text = ' '.join(keywords) # Generate word cloud wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text) # Display the word cloud using matplotlib plt.figure(figsize=(600, 200)) plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') plt.show() def save_in_file(table_content): """ Helper function to save search analysis in a file. """ file_path = os.environ.get('SEARCH_SAVE_FILE') try: # Save the content to the file with open(file_path, "a+", encoding="utf-8") as file: file.write(table_content) file.write("\n" * 3) # Add three newlines at the end logger.info(f"Search content saved to {file_path}") except Exception as e: logger.error(f"Error occurred while writing to the file: {e}") def do_google_trends_analysis(search_term): """ Get a google search keywords, get its stats.""" search_term = [f"{search_term}"] all_the_keywords = [] try: for asearch_term in search_term: #FIXME: Lets work with a single root keyword. suggestions_df = get_suggestions_for_keyword(asearch_term) if len(suggestions_df['Keywords']) > 10: result_df = perform_keyword_clustering(suggestions_df) # Display top keywords in each cluster top_keywords = print_and_return_top_keywords(result_df) all_the_keywords.append(top_keywords['Keywords'].tolist()) else: all_the_keywords.append(suggestions_df['Keywords'].tolist()) all_the_keywords = ','.join([', '.join(filter(None, map(str, sublist))) for sublist in all_the_keywords]) # Generate a random sleep time between 2 and 3 seconds time.sleep(random.uniform(2, 3)) # Display additional information try: result_df = get_related_topics_and_save_csv(search_term) logger.info(f"Related topics:: result_df: {result_df}") # Extract 'Top' topic_title if result_df: top_topic_title = result_df['top']['topic_title'].values.tolist() # Join each sublist into one string separated by comma #top_topic_title = [','.join(filter(None, map(str, sublist))) for sublist in top_topic_title] top_topic_title = ','.join([', '.join(filter(None, map(str, sublist))) for sublist in top_topic_title]) except Exception as err: logger.error(f"Failed to get results from google trends related topics: {err}") # TBD: Not getting great results OR unable to understand them. #all_the_keywords += top_topic_title all_the_keywords = all_the_keywords.split(',') # Split the list into chunks of 5 keywords chunk_size = 4 chunks = [all_the_keywords[i:i + chunk_size] for i in range(0, len(all_the_keywords), chunk_size)] # Create a DataFrame with columns named 'Keyword 1', 'Keyword 2', etc. combined_df = pd.DataFrame(chunks, columns=[f'KπŸ“’eyword Col{i + 1}' for i in range(chunk_size)]) # Print the table table = tabulate(combined_df, headers='keys', tablefmt='fancy_grid') # Save the combined table to a file try: save_in_file(table) except Exception as save_results_err: logger.error(f"Failed to save search results: {save_results_err}") print(table) #generate_wordcloud(all_the_keywords) return(all_the_keywords) except Exception as e: logger.error(f"Error in Google Trends Analysis: {e}") def get_trending_searches(country='united_states'): """Get trending searches for a specific country.""" try: pytrends = TrendReq(hl='en-US', tz=360) trending_searches = pytrends.trending_searches(pn=country) return trending_searches except Exception as e: logger.error(f"Error getting trending searches: {e}") return pd.DataFrame() def get_realtime_trends(country='US'): """Get realtime trending searches for a specific country.""" try: pytrends = TrendReq(hl='en-US', tz=360) realtime_trends = pytrends.realtime_trending_searches(pn=country) return realtime_trends except Exception as e: logger.error(f"Error getting realtime trends: {e}") return pd.DataFrame()