Feat: Add Gemini AI for Reddit thread summarization

This commit introduces the capability to summarize Reddit thread content (title + selftext) using the Google Gemini API.

Key changes:
- Added `google-generativeai` to `requirements.txt`.
- Updated `utils/.config.template.toml` and `utils/settings.py` to include a new `[gemini]` configuration section for `api_key` and `enable_summary`. Includes an interactive prompt for the API key if summarization is enabled but the key is missing.
- Created the `utils/gemini_client.py` module:
  - `initialize_gemini()`: Configures the Gemini client with the API key and initializes the `gemini-1.5-pro-latest` model (configurable via settings).
  - `summarize_text_with_gemini()`: Takes text, generates a prompt, calls the Gemini API, and returns the summary. Includes error handling for API calls and content safety.
- Integrated into `main.py`:
  - The Gemini client is initialized at startup if enabled.
  - In `generate_audio_and_screenshots()`, if summarization is enabled, the thread's selftext is replaced with the Gemini-generated summary before being passed to TTS. Falls back to the original text on error.
- Added relevant logging and documentation for the new feature.
5 months ago · 1cb4be234a
parent 47612a09ac
commit 1cb4be234a
5 changed files with 254 additions and 5 deletions
--- a/main.py
+++ b/main.py
@ -29,6 +29,7 @@ from utils.ffmpeg_install import ffmpeg_install
 # Assuming it was for generating a unique ID from the reddit object,
 # this functionality will be implicitly handled by using reddit_object["thread_id"]
 from utils.version import checkversion
+from utils.gemini_client import summarize_text_with_gemini # Added for summarization
 from video_creation.background import (
    chop_background,
    download_background_audio,
@ -140,9 +141,45 @@ def get_reddit_data(post_id_override: str = None) -> Dict[str, Any]:
    return reddit_object

 def generate_audio_and_screenshots(reddit_object: Dict[str, Any]) -> Tuple[int, int]:
-    """Generates TTS audio for the reddit content and takes screenshots."""
-    logging.info("Generating audio and screenshots...")
-    length, number_of_comments = save_text_to_mp3(reddit_object)
+    """
+    Generates TTS audio for the Reddit content (potentially summarized) and takes screenshots.
+    """
+    logging.info("Preparing content for audio and screenshots...")
+
+    # --- Gemini Summarization Step ---
+    if settings.config.get("gemini", {}).get("enable_summary"):
+        logging.info("Gemini summarization enabled. Attempting to summarize thread content.")
+        # Construct text for summarization: title + selftext
+        # Ensure 'thread_title' and 'thread_selftext' exist. 'thread_selftext' might be empty for image/link posts.
+        title = reddit_object.get("thread_title", "")
+        selftext = reddit_object.get("thread_selftext", "") # This is what TTSEngine uses for the main post
+
+        if selftext and selftext.strip(): # Only summarize if there's actual selftext
+            text_to_summarize = f"Titel: {title}\n\nBericht:\n{selftext}"
+            logging.debug(f"Text to summarize (first 200 chars): {text_to_summarize[:200]}")
+
+            summary = summarize_text_with_gemini(text_to_summarize)
+            if summary:
+                logging.info("Successfully summarized thread content with Gemini.")
+                # Replace the original selftext with the summary for TTS
+                # The TTSEngine in voices.py specifically looks for 'thread_selftext' for the main post.
+                reddit_object["thread_selftext"] = summary
+                # The title is usually read out separately by TTSEngine.
+                # If the summary should also include/replace the title for TTS, this logic might need adjustment
+                # or the prompt to Gemini could be to make the summary inclusive of the title's context.
+                # Current prompt: "Vat de volgende Reddit-thread samen in een boeiend en beknopt verhaal..."
+                # This implies the summary might naturally incorporate the title's essence.
+                logging.debug(f"Using Gemini summary for TTS (first 100 chars): {summary[:100]}")
+            else:
+                logging.warning("Failed to get summary from Gemini, or summary was empty. Using original selftext.")
+        else:
+            logging.info("No selftext found or selftext is empty. Skipping Gemini summarization for this post.")
+    else:
+        logging.info("Gemini summarization is not enabled.")
+    # --- End Gemini Summarization ---
+
+    logging.info("Proceeding to generate TTS audio and screenshots with (potentially summarized) content...")
+    length, number_of_comments = save_text_to_mp3(reddit_object) # save_text_to_mp3 uses reddit_object["thread_selftext"]
    final_length = math.ceil(length)
    get_screenshots_of_reddit_posts(reddit_object, number_of_comments)
    logging.info("Audio and screenshots generated.")
@ -238,6 +275,20 @@ if __name__ == "__main__":

    app_config = initialize_app_checks_and_config()

+    # Initialize Gemini client if enabled
+    if app_config.get("gemini", {}).get("enable_summary"):
+        try:
+            from utils.gemini_client import initialize_gemini
+            if not initialize_gemini():
+                logging.warning("Gemini client initialization failed or was skipped. Summarization will not be available.")
+            else:
+                logging.info("Gemini client initialized for summarization.")
+        except ImportError:
+            logging.error("Failed to import gemini_client. Summarization will not be available. Ensure google-generativeai is installed.")
+        except Exception as e:
+            logging.error(f"An unexpected error occurred during Gemini initialization: {e}", exc_info=True)
+
+
    try:
        post_ids_str = app_config.get("reddit", {}).get("thread", {}).get("post_id")
        times_to_run = app_config.get("settings", {}).get("times_to_run")
--- a/requirements.txt
+++ b/requirements.txt
@ -26,3 +26,4 @@ tiktok-uploader==1.1.1
 google-api-python-client==2.171.0
 google-auth-oauthlib==1.2.2
 google-auth-httplib2==0.2.0
+google-generativeai>=0.3.0 # Added for Gemini
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -56,3 +56,8 @@ python_voice = { optional = false, default = "1", example = "1", explanation = "
 py_voice_num = { optional = false, default = "2", example = "2", explanation = "The number of system voices (2 are pre-installed in Windows)" }
 silence_duration = { optional = true, example = "0.1", explanation = "Time in seconds between TTS comments", default = 0.3, type = "float" }
 no_emojis = { optional = false, type = "bool", default = false, example = false, options = [true, false,], explanation = "Whether to remove emojis from the comments" }
+
+[gemini] # New section for Gemini AI settings
+api_key = { optional = true, default = "", type = "str", explanation = "Your Google Gemini API Key. Required if enable_summary is true.", example = "AIzaxxxxxxxxxxxxxxxxxxxxxxx" } # Ensure type is str
+enable_summary = { optional = true, type = "bool", default = false, example = false, options = [true, false,], explanation = "Enable summarizing Reddit thread selftext using Gemini AI." }
+# Future Gemini settings could go here, e.g., model_name, specific prompts, etc.
--- a/utils/gemini_client.py
+++ b/utils/gemini_client.py
@ -0,0 +1,158 @@
+# utils/gemini_client.py
+"""
+Client module for interacting with the Google Gemini API.
+
+Provides functionalities to initialize the Gemini client and to perform
+specific generative AI tasks, such as text summarization.
+The API key and model preferences are typically read from the application settings.
+"""
+import logging
+import google.generativeai as genai
+from google.api_core import exceptions as google_exceptions # For more specific error handling
+
+from utils import settings # To access API key and other settings
+
+logger = logging.getLogger(__name__)
+
+# Module-level cache for the initialized model so it is created only once per process.
+# `_gemini_model` is set by initialize_gemini(); `_gemini_initialized` records whether that
+# setup succeeded. Both are reset to their defaults if initialization raises.
+_gemini_model = None
+_gemini_initialized = False
+
+def initialize_gemini() -> bool:
+    """
+    Configure the Gemini client from settings and create the shared model instance.
+    Reads `api_key`, `enable_summary` and the optional `model_name` from the `gemini` section of `settings.config`.
+
+    Returns:
+        bool: True if the model was created now or on an earlier call; False if summaries are disabled, the API key is missing, or setup raised.
+    """
+    global _gemini_model, _gemini_initialized
+
+    if _gemini_initialized:
+        logger.debug("Gemini client already initialized.")
+        return True
+
+    if not settings.config.get("gemini", {}).get("enable_summary", False):
+        logger.info("Gemini summary is not enabled in settings. Skipping Gemini client initialization.")
+        return False # Not an error, but not initialized for summarization
+
+    api_key = settings.config.get("gemini", {}).get("api_key")
+    if not api_key:
+        logger.error("Gemini API key not found in settings. Cannot initialize Gemini client.")
+        return False
+
+    try:
+        logger.info("Configuring Google Gemini API...")
+        genai.configure(api_key=api_key)
+        # Model comes from the optional `model_name` setting; falls back to gemini-1.5-pro-latest
+        model_name = settings.config.get("gemini", {}).get("model_name", "gemini-1.5-pro-latest")
+        logger.info(f"Attempting to initialize Gemini model: {model_name}")
+        _gemini_model = genai.GenerativeModel(model_name)
+        _gemini_initialized = True
+        logger.info(f"Google Gemini client initialized successfully with model: {model_name}.")
+        return True
+    except Exception as e:
+        logger.error(f"Failed to initialize Google Gemini client: {e}", exc_info=True)
+        _gemini_initialized = False # Explicitly mark as not initialized on error
+        _gemini_model = None
+        return False
+
+
+def summarize_text_with_gemini(text_to_summarize: str) -> str | None:
+    """
+    Summarizes the given text using the Gemini model created by initialize_gemini().
+
+    Args:
+        text_to_summarize (str): The text content to be summarized.
+
+    Returns:
+        str | None: The summary text, or None if the client is not initialized, the input is blank, the response has no parts, or the API call raises.
+    """
+    global _gemini_model, _gemini_initialized
+
+    if not _gemini_initialized or _gemini_model is None:
+        # Attempt to initialize if not done yet (e.g. if called directly without main.py's init)
+        # However, it's better if initialize_gemini() is called explicitly once.
+        # For robustness, let's check and log.
+        logger.warning("Gemini client not initialized or model not available. Call initialize_gemini() first.")
+        # Optionally, could try to initialize here:
+        # if not initialize_gemini():
+        #     return None
+        # But for now, require prior initialization.
+        return None
+
+    if not text_to_summarize or not text_to_summarize.strip():
+        logger.warning("Text to summarize is empty. Skipping Gemini call.")
+        return None
+
+    # NOTE(review): 30720 input tokens looks like the gemini-1.0-pro limit; confirm the limit of the configured model.
+    # If text_to_summarize is very long, it might need truncation or chunking.
+    # For now, sending the whole text.
+    # A simple truncation:
+    # MAX_INPUT_LENGTH_APPROX = 28000 # Leave some room for prompt and overhead
+    # if len(text_to_summarize) > MAX_INPUT_LENGTH_APPROX:
+    #     logger.warning(f"Input text length ({len(text_to_summarize)}) is very long. Truncating to {MAX_INPUT_LENGTH_APPROX} characters for Gemini.")
+    #     text_to_summarize = text_to_summarize[:MAX_INPUT_LENGTH_APPROX]
+
+
+    prompt = f"Vat de volgende Reddit-thread samen in een boeiend en beknopt verhaal. Geef alleen de samenvatting terug, zonder extra inleidende of afsluitende zinnen:\n\n{text_to_summarize}"
+    logger.info(f"Sending text (length: {len(text_to_summarize)}) to Gemini for summarization.")
+    logger.debug(f"Gemini prompt (first 100 chars of text): Vat de volgende Reddit-thread samen... {text_to_summarize[:100]}...")
+
+    try:
+        # Safety settings can be added here if needed, e.g.
+        # safety_settings = [
+        #     {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+        #     {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+        #     {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
+        #     {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
+        # ]
+        # response = _gemini_model.generate_content(prompt, safety_settings=safety_settings)
+        response = _gemini_model.generate_content(prompt)
+
+        if response.parts:
+            summary = response.text # .text joins parts automatically
+            logger.info("Successfully received summary from Gemini.")
+            logger.debug(f"Gemini summary: {summary[:100]}...") # Log beginning of summary
+            return summary
+        else:
+            # No parts came back; log the block reason from prompt_feedback when one is present.
+            logger.warning("Gemini response contained no parts (summary might be empty or blocked).")
+            if response.prompt_feedback and response.prompt_feedback.block_reason:
+                logger.warning(f"Gemini content generation blocked. Reason: {response.prompt_feedback.block_reason_message or response.prompt_feedback.block_reason}")
+            return None
+
+    except google_exceptions.RetryError as e:
+        logger.error(f"Gemini API request failed after retries (RetryError): {e}", exc_info=True)
+    except google_exceptions.GoogleAPIError as e: # General Google API error
+        logger.error(f"Gemini API request failed (GoogleAPIError): {e}", exc_info=True)
+    except Exception as e: # Catch any other exceptions
+        logger.error(f"An unexpected error occurred while calling Gemini API: {e}", exc_info=True)
+
+    return None
+
+# Manual smoke-test entry point; as written, running the module directly only prints a notice.
+if __name__ == '__main__':
+    # The commented-out example below needs settings.config to be mocked or replaced with a dummy config.
+    # Until it is uncommented, this block is only a placeholder.
+    print("Testing gemini_client.py (requires manual setup of config or mocking)")
+
+    # Dummy config for direct testing:
+    # settings.config = {
+    #     "gemini": {
+    #         "enable_summary": True,
+    #         "api_key": "YOUR_ACTUAL_API_KEY_FOR_TESTING_ONLY",
+    #     }
+    # }
+    # if initialize_gemini():
+    #     sample_text = "Dit is een lange testtekst over een Reddit thread die heel interessant was. Het ging over katten en honden en hoe ze soms wel en soms niet met elkaar overweg kunnen. De gebruiker vroeg om advies omdat zijn kat en hond constant ruzie maakten. Er waren veel commentaren met goede tips."
+    #     summary = summarize_text_with_gemini(sample_text)
+    #     if summary:
+    #         print("\n--- Summary ---")
+    #         print(summary)
+    #     else:
+    #         print("\nFailed to get summary.")
+    # else:
+    #     print("Failed to initialize Gemini for testing.")
--- a/utils/settings.py
+++ b/utils/settings.py
@ -260,9 +260,43 @@ Overwrite with a fresh configuration based on the template? (y/n)[/blue]""")
 If you see any prompts, that means that you have unset/incorrectly set variables, please input the correct values.\
 """
    )
-    crawl(template, check_vars)
-    with open(config_file, "w") as f:
+    crawl(template, check_vars) # Populates global `config` by validating against template
+
+    # --- Custom validation for Gemini settings ---
+    if "gemini" in config and isinstance(config["gemini"], dict):
+        gemini_settings = config["gemini"]
+        if gemini_settings.get("enable_summary") is True:
+            if not gemini_settings.get("api_key"):
+                logger.warning("Gemini summary is enabled, but API key is missing.")
+                # Prompt user for API key if missing and summary is enabled
+                # This uses the `handle_input` which is part of `utils.console`
+                # and uses Rich Console for prompting.
+                gemini_api_key_checks = template.get("gemini", {}).get("api_key", {})
+                # Ensure the 'type' is 'str' for handle_input if not otherwise specified for this direct call
+                gemini_api_key_checks.setdefault("type", "str")
+
+                # We need a local Rich Console instance if handle_input relies on a global one not set here.
+                # However, handle_input itself creates a Console instance.
+
+                api_key_value = handle_input(
+                    message="[#C0CAF5 bold]Gemini API Key ([red]required as summary is enabled[/#C0CAF5 bold]): ",
+                    extra_info=gemini_api_key_checks.get("explanation", "Enter your Google Gemini API Key."),
+                    check_type=_get_safe_type_converter(gemini_api_key_checks.get("type", "str")), # Pass callable
+                    optional=False, # It's not optional if enable_summary is true
+                    err_message="API Key cannot be empty when summary is enabled."
+                    # Potentially add nmin for basic validation if desired
+                )
+                config["gemini"]["api_key"] = api_key_value
+                if not api_key_value: # Double check if user somehow bypassed
+                    logger.error("Gemini API Key is required when enable_summary is true. Configuration incomplete.")
+                    return False # Indicate failure
+            else:
+                logger.debug("Gemini summary enabled and API key is present.")
+    # --- End custom validation ---
+
+    with open(config_file, "w", encoding="utf-8") as f: # Ensure encoding
        toml.dump(config, f)
+    logger.info(f"Configuration successfully checked and saved to {config_file}")
    return config