ollama client can now call multimodal models

This commit is contained in:
Michael Peter Christen
2025-03-25 22:57:18 +01:00
parent 39518e8647
commit 556e67b2c7
3 changed files with 70 additions and 4 deletions

BIN
llmtest/testimage.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

View File

@@ -1,10 +1,14 @@
import os
import json
import time
import base64
import urllib3
import requests
from PIL import Image
from io import BytesIO
from argparse import ArgumentParser
def ollama_list(api_base='http://localhost:11434'):
# call api http://localhost:11434/api/tags with http get request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -45,7 +49,10 @@ def ollama_chat_endpoint(api_base='http://localhost:11434', model_name='llama3.2
}
return endpoint
def ollama_chat(endpoint, prompt='Hello World', temperature=0.0, max_tokens=32768):
def hex2base64(hex_string):
return base64.b64encode(bytes.fromhex(hex_string)).decode('utf-8')
def ollama_chat(endpoint, prompt='Hello World', base64_image=None, temperature=0.0, max_tokens=32768):
# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -68,12 +75,38 @@ def ollama_chat(endpoint, prompt='Hello World', temperature=0.0, max_tokens=3276
if modelname.startswith("o1") or modelname.startswith("gpt-o1"):
temperature = 1.0 # o1 models need temperature 1.0
else:
messages.append({"content": "You are a helpful assistant", "role": "system"})
messages.append({"role": "system", "content": "You are a helpful assistant"})
if modelname.startswith("4o") or modelname.startswith("gpt-4o") or modelname.startswith("gpt-3.5"):
# reduce number of stoptokes to 4
stoptokens = ["[/INST]", "<|im_end|>", "<|end_of_turn|>", "<|eot_id|>"]
messages.append({"role": "user", "content": prompt})
if base64_image:
image_type = "jpeg"
#base64_magic = {"/9j/": "jpeg", "iVBO": "png", "Qk": "bmp", "R0lG": "gif", "SUkq": "tiff", "SUkr": "tiff", "TU0A": "tiff", "GkXf": "webp", "UklG": "webp"}
base64_magic = {"/9j/": "jpeg", "iVBO": "png", "R0lG": "gif"} # only jpeg and png are allowed as data type; however all of the types above (but gif!) are supported by the API
for magic, itype in base64_magic.items():
if base64_image.startswith(magic):
#print(f"Detected {itype} image")
image_type = itype
break
# If this is a gif we must convert it to png
if image_type == "gif":
#print("Converting gif to png")
image = Image.open(BytesIO(base64.b64decode(base64_image)))
png_image = BytesIO()
image.save(png_image, format="PNG")
base64_image = base64.b64encode(png_image.getvalue()).decode('utf-8')
image_type = "png"
# Add the image to the message
image_url_object = {"url": f"data:image/{image_type};base64,{base64_image}"}
usermessage = {"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": image_url_object}
]}
else:
usermessage = {"role": "user", "content": prompt}
messages.append(usermessage)
if modelname.startswith("o1") or modelname.startswith("4o"):
stoptokens = []
@@ -129,17 +162,31 @@ def ollama_chat(endpoint, prompt='Hello World', temperature=0.0, max_tokens=3276
except json.JSONDecodeError as e:
raise Exception(f"Failed to parse JSON response from the API: {e}")
def test_multimodal(endpoint):
image_path = "llmtest/testimage.png"
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
try:
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
if "42" in answer:
return True
return False
except Exception as e:
return False
def main():
parser = ArgumentParser(description="Testing the ollama API.")
parser.add_argument('--api_base', required=False, default='http://localhost:11434', help='API base URL for the LLM, default is http://localhost:11434')
parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
parser.add_argument('--image', required=False, default=None, help='path to an image that shall be processed')
# parse the arguments
args = parser.parse_args()
api_base = args.api_base
endpoint_name = args.endpoint
model_name = args.model
image_path = args.image
# load the endpoint file
endpoint = {}
@@ -154,11 +201,29 @@ def main():
else:
endpoint = ollama_chat_endpoint(api_base, model_name)
# test if the endpoint is a multimodal model
if test_multimodal(endpoint):
print("Endpoint is a multimodal model.")
else:
print("Endpoint is not a multimodal model.")
# load the image, if a path is given
base64_image = None
if image_path:
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
# access the ollama API
models_dict = ollama_list()
for (model, attr) in models_dict.items():
print(f"Model: {model}")
try:
if base64_image:
answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
else:
answer, total_tokens, token_per_second = ollama_chat(endpoint)
except Exception as e:
answer = f"Error: {str(e)}"
print(answer)
if __name__ == "__main__":

View File

@@ -1,3 +1,4 @@
PIL
sympy
urllib3
requests