feat: add image/photo support for vision-capable LLM requests

This commit is contained in:
Andre Kamarudin 2026-04-16 08:42:29 +08:00
parent 9316a38699
commit 8bd9ce3beb

42
main.py
View file

@ -3,14 +3,15 @@
from __future__ import annotations from __future__ import annotations
import base64
import json import json
import logging import logging
import os import os
import pathlib import pathlib
import subprocess import subprocess
from openai import OpenAI
import telegram import telegram
from openai import OpenAI
from telegram import BotCommand, Update from telegram import BotCommand, Update
from telegram.error import BadRequest, TimedOut from telegram.error import BadRequest, TimedOut
from telegram.ext import ( from telegram.ext import (
@ -308,9 +309,8 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None
if not _is_authorized(update.effective_user.id): if not _is_authorized(update.effective_user.id):
return return
chat_id = update.effective_chat.id chat_id = update.effective_chat.id
user_text = update.message.text # Text can come from message.text (plain) or message.caption (photo)
if not user_text: user_text = update.message.text or update.message.caption or ""
return
# In group chats, only respond if bot is mentioned or replied to # In group chats, only respond if bot is mentioned or replied to
if update.effective_chat.type in ("group", "supergroup"): if update.effective_chat.type in ("group", "supergroup"):
@ -326,11 +326,37 @@ async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE) -> None
# Strip the @mention from the text # Strip the @mention from the text
if bot_username: if bot_username:
user_text = user_text.replace(f"@{bot_username}", "").strip() user_text = user_text.replace(f"@{bot_username}", "").strip()
if not user_text:
# Download photo if present
image_b64: str | None = None
if update.message.photo:
try:
photo = update.message.photo[-1] # highest resolution
file = await ctx.bot.get_file(photo.file_id)
data = await file.download_as_bytearray()
image_b64 = base64.b64encode(bytes(data)).decode()
log.info("Downloaded photo: %d bytes", len(data))
except Exception as e:
log.error("Failed to download photo: %s", e)
if not user_text and not image_b64:
return return
# Build user message content (text-only or multipart with image)
if image_b64:
content_parts: list[dict] = []
if user_text:
content_parts.append({"type": "text", "text": user_text})
content_parts.append({
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
})
user_content: str | list[dict] = content_parts
else:
user_content = user_text
messages = get_messages(chat_id) messages = get_messages(chat_id)
messages.append({"role": "user", "content": user_text}) messages.append({"role": "user", "content": user_content})
# Send immediate "Thinking ..." placeholder so user knows the bot read their message # Send immediate "Thinking ..." placeholder so user knows the bot read their message
thinking = await _safe_reply(update, "Thinking ...") thinking = await _safe_reply(update, "Thinking ...")
@ -432,7 +458,9 @@ def main() -> None:
app = ApplicationBuilder().token(TG_BOT_TOKEN).build() app = ApplicationBuilder().token(TG_BOT_TOKEN).build()
app.add_handler(CommandHandler("start", cmd_start)) app.add_handler(CommandHandler("start", cmd_start))
app.add_handler(CommandHandler("reset", cmd_reset)) app.add_handler(CommandHandler("reset", cmd_reset))
app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message)) app.add_handler(MessageHandler(
(filters.TEXT | filters.PHOTO) & ~filters.COMMAND, handle_message
))
app.add_error_handler(_error_handler) app.add_error_handler(_error_handler)
# Register bot commands for the / menu # Register bot commands for the / menu