feat: add image/photo support for vision-capable LLM requests

2026-04-16 08:42:29 +08:00 · 2026-04-16 08:42:29 +08:00 · 8bd9ce3beb
commit 8bd9ce3beb
parent 9316a38699
1 changed file with 36 additions and 8 deletions
--- a/main.py
+++ b/main.py
@@ -3,14 +3,15 @@
 from __future__ import annotations
+import base64
 import json
 import logging
 import os
 import pathlib
 import subprocess
-from openai import OpenAI
 import telegram
+from openai import OpenAI
 from telegram import BotCommand, Update
 from telegram.error import BadRequest, TimedOut
 from telegram.ext import (
@@ -308,9 +309,8 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None
    if not _is_authorized(update.effective_user.id):
        return
    chat_id = update.effective_chat.id
-    user_text = update.message.text
-    if not user_text:
-        return
+    # Text can come from message.text (plain) or message.caption (photo)
+    user_text = update.message.text or update.message.caption or ""
    # In group chats, only respond if bot is mentioned or replied to
    if update.effective_chat.type in ("group", "supergroup"):
@@ -326,11 +326,37 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None
        # Strip the @mention from the text
        if bot_username:
            user_text = user_text.replace(f"@{bot_username}", "").strip()
-        if not user_text:
+
+    # Download photo if present
+    image_b64: str | None = None
+    if update.message.photo:
+        try:
+            photo = update.message.photo[-1]  # highest resolution
+            file = await ctx.bot.get_file(photo.file_id)
+            data = await file.download_as_bytearray()
+            image_b64 = base64.b64encode(bytes(data)).decode()
+            log.info("Downloaded photo: %d bytes", len(data))
+        except Exception as e:
+            log.error("Failed to download photo: %s", e)
+    if not user_text and not image_b64:
+        return
+    # Build user message content (text-only or multipart with image)
+    if image_b64:
+        content_parts: list[dict] = []
+        if user_text:
+            content_parts.append({"type": "text", "text": user_text})
+        content_parts.append({
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
+        })
+        user_content: str | list[dict] = content_parts
+    else:
+        user_content = user_text
    messages = get_messages(chat_id)
-    messages.append({"role": "user", "content": user_text})
+    messages.append({"role": "user", "content": user_content})
    # Send immediate "Thinking ..." placeholder so user knows the bot read their message
    thinking = await _safe_reply(update, "Thinking ...")
@@ -432,7 +458,9 @@ def main() -> None:
    app = ApplicationBuilder().token(TG_BOT_TOKEN).build()
    app.add_handler(CommandHandler("start", cmd_start))
    app.add_handler(CommandHandler("reset", cmd_reset))
-    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
+    app.add_handler(MessageHandler(
+        (filters.TEXT | filters.PHOTO) & ~filters.COMMAND, handle_message
+    ))
    app.add_error_handler(_error_handler)
    # Register bot commands for the / menu