From 8bd9ce3bebd74048305d8595db39707488254b14 Mon Sep 17 00:00:00 2001
From: Andre K <andre.kamarudin@gmail.com>
Date: Thu, 16 Apr 2026 08:42:29 +0800
Subject: [PATCH] feat: add image/photo support for vision-capable LLM requests

---
 main.py | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/main.py b/main.py
index bf350c5..4b3fb66 100644
--- a/main.py
+++ b/main.py
@@ -3,14 +3,15 @@
 
 from __future__ import annotations
 
+import base64
 import json
 import logging
 import os
 import pathlib
 import subprocess
 
-from openai import OpenAI
 import telegram
+from openai import OpenAI
 from telegram import BotCommand, Update
 from telegram.error import BadRequest, TimedOut
 from telegram.ext import (
@@ -308,9 +309,8 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None
     if not _is_authorized(update.effective_user.id):
         return
     chat_id = update.effective_chat.id
-    user_text = update.message.text
-    if not user_text:
-        return
+    # Text can come from message.text (plain) or message.caption (photo)
+    user_text = update.message.text or update.message.caption or ""
 
     # In group chats, only respond if bot is mentioned or replied to
     if update.effective_chat.type in ("group", "supergroup"):
@@ -326,11 +326,37 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None
         # Strip the @mention from the text
         if bot_username:
             user_text = user_text.replace(f"@{bot_username}", "").strip()
-        if not user_text:
-            return
+
+    # Download photo if present
+    image_b64: str | None = None
+    if update.message.photo:
+        try:
+            photo = update.message.photo[-1]  # highest resolution
+            file = await ctx.bot.get_file(photo.file_id)
+            data = await file.download_as_bytearray()
+            image_b64 = base64.b64encode(bytes(data)).decode()
+            log.info("Downloaded photo: %d bytes", len(data))
+        except Exception as e:
+            log.error("Failed to download photo: %s", e)
+
+    if not user_text and not image_b64:
+        return
+
+    # Build user message content (text-only or multipart with image)
+    if image_b64:
+        content_parts: list[dict] = []
+        if user_text:
+            content_parts.append({"type": "text", "text": user_text})
+        content_parts.append({
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
+        })
+        user_content: str | list[dict] = content_parts
+    else:
+        user_content = user_text
 
     messages = get_messages(chat_id)
-    messages.append({"role": "user", "content": user_text})
+    messages.append({"role": "user", "content": user_content})
 
     # Send immediate "Thinking ..." placeholder so user knows the bot read their message
     thinking = await _safe_reply(update, "Thinking ...")
@@ -432,7 +458,9 @@ def main() -> None:
     app = ApplicationBuilder().token(TG_BOT_TOKEN).build()
     app.add_handler(CommandHandler("start", cmd_start))
     app.add_handler(CommandHandler("reset", cmd_reset))
-    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
+    app.add_handler(MessageHandler(
+        (filters.TEXT | filters.PHOTO) & ~filters.COMMAND, handle_message
+    ))
     app.add_error_handler(_error_handler)
 
     # Register bot commands for the / menu