From 8bd9ce3bebd74048305d8595db39707488254b14 Mon Sep 17 00:00:00 2001 From: Andre K Date: Thu, 16 Apr 2026 08:42:29 +0800 Subject: [PATCH] feat: add image/photo support for vision-capable LLM requests --- main.py | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index bf350c5..4b3fb66 100644 --- a/main.py +++ b/main.py @@ -3,14 +3,15 @@ from __future__ import annotations +import base64 import json import logging import os import pathlib import subprocess -from openai import OpenAI import telegram +from openai import OpenAI from telegram import BotCommand, Update from telegram.error import BadRequest, TimedOut from telegram.ext import ( @@ -308,9 +309,8 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None if not _is_authorized(update.effective_user.id): return chat_id = update.effective_chat.id - user_text = update.message.text - if not user_text: - return + # Text can come from message.text (plain) or message.caption (photo) + user_text = update.message.text or update.message.caption or "" # In group chats, only respond if bot is mentioned or replied to if update.effective_chat.type in ("group", "supergroup"): @@ -326,11 +326,37 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None # Strip the @mention from the text if bot_username: user_text = user_text.replace(f"@{bot_username}", "").strip() - if not user_text: - return + + # Download photo if present + image_b64: str | None = None + if update.message.photo: + try: + photo = update.message.photo[-1] # highest resolution + file = await ctx.bot.get_file(photo.file_id) + data = await file.download_as_bytearray() + image_b64 = base64.b64encode(bytes(data)).decode() + log.info("Downloaded photo: %d bytes", len(data)) + except Exception as e: + log.error("Failed to download photo: %s", e) + + if not user_text and not image_b64: + return + + # Build user message content (text-only or multipart with image) + if image_b64: + content_parts: list[dict] = [] + if user_text: + content_parts.append({"type": "text", "text": user_text}) + content_parts.append({ + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}, + }) + user_content: str | list[dict] = content_parts + else: + user_content = user_text messages = get_messages(chat_id) - messages.append({"role": "user", "content": user_text}) + messages.append({"role": "user", "content": user_content}) # Send immediate "Thinking ..." placeholder so user knows the bot read their message thinking = await _safe_reply(update, "Thinking ...") @@ -432,7 +458,9 @@ def main() -> None: app = ApplicationBuilder().token(TG_BOT_TOKEN).build() app.add_handler(CommandHandler("start", cmd_start)) app.add_handler(CommandHandler("reset", cmd_reset)) - app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message)) + app.add_handler(MessageHandler( + (filters.TEXT | filters.PHOTO) & ~filters.COMMAND, handle_message + )) app.add_error_handler(_error_handler) # Register bot commands for the / menu