ollama client can now call multimodal models

2025-03-25 22:57:18 +01:00
parent 39518e8647
commit 556e67b2c7
3 changed files with 70 additions and 4 deletions
--- a/llmtest/testimage.png
+++ b/llmtest/testimage.png
--- a/ollama_client.py
+++ b/ollama_client.py
@@ -1,10 +1,14 @@
 import os
 import json
 import time
 import base64
 import urllib3
 import requests
 from PIL import Image
 from io import BytesIO
 from argparse import ArgumentParser
 def ollama_list(api_base='http://localhost:11434'):
    # call api http://localhost:11434/api/tags with http get request
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -45,7 +49,10 @@ def ollama_chat_endpoint(api_base='http://localhost:11434', model_name='llama3.2
    }
    return endpoint
-def ollama_chat(endpoint, prompt='Hello World', temperature=0.0, max_tokens=32768):
+def hex2base64(hex_string):
    # Re-encode hex-encoded binary data as base64 text: the hex digits are
    # decoded to raw bytes, those bytes are base64-encoded, and the result is
    # returned as a str. bytes.fromhex raises ValueError when hex_string is
    # not valid hexadecimal.
    return base64.b64encode(bytes.fromhex(hex_string)).decode('utf-8')
 def ollama_chat(endpoint, prompt='Hello World', base64_image=None, temperature=0.0, max_tokens=32768):
    # Disable SSL warnings
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -68,12 +75,38 @@ def ollama_chat(endpoint, prompt='Hello World', temperature=0.0, max_tokens=3276
    if modelname.startswith("o1") or modelname.startswith("gpt-o1"):
        temperature = 1.0 # o1 models need temperature 1.0
    else:
-        messages.append({"content": "You are a helpful assistant", "role": "system"})
+        messages.append({"role": "system", "content": "You are a helpful assistant"})
    if modelname.startswith("4o") or modelname.startswith("gpt-4o") or modelname.startswith("gpt-3.5"):
        # reduce the number of stop tokens to 4
        stoptokens = ["[/INST]", "<|im_end|>", "<|end_of_turn|>", "<|eot_id|>"]
-    messages.append({"role": "user", "content": prompt})
+    if base64_image:
        image_type = "jpeg"
        #base64_magic = {"/9j/": "jpeg", "iVBO": "png", "Qk": "bmp", "R0lG": "gif", "SUkq": "tiff", "SUkr": "tiff", "TU0A": "tiff", "GkXf": "webp", "UklG": "webp"}
        base64_magic = {"/9j/": "jpeg", "iVBO": "png", "R0lG": "gif"} # only jpeg and png are allowed as data type; however all of the types above (but gif!) are supported by the API
        for magic, itype in base64_magic.items():
            if base64_image.startswith(magic):
                #print(f"Detected {itype} image")
                image_type = itype
                break
        # If this is a gif we must convert it to png
        if image_type == "gif":
            #print("Converting gif to png")
            image = Image.open(BytesIO(base64.b64decode(base64_image)))
            png_image = BytesIO()
            image.save(png_image, format="PNG")
            base64_image = base64.b64encode(png_image.getvalue()).decode('utf-8')
            image_type = "png"
        # Add the image to the message
        image_url_object = {"url": f"data:image/{image_type};base64,{base64_image}"}
        usermessage = {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": image_url_object}
        ]}
    else:
        usermessage = {"role": "user", "content": prompt}
    messages.append(usermessage)
    if modelname.startswith("o1") or modelname.startswith("4o"):
        stoptokens = []
@@ -129,17 +162,31 @@ def ollama_chat(endpoint, prompt='Hello World', temperature=0.0, max_tokens=3276
    except json.JSONDecodeError as e:
        raise Exception(f"Failed to parse JSON response from the API: {e}")
 def test_multimodal(endpoint):
    # Probe whether the model behind `endpoint` can read images: the bundled
    # test picture is sent together with a question, and the result is True
    # only when the reply contains "42" (presumably what the test image
    # shows -- verify against llmtest/testimage.png).
    with open("llmtest/testimage.png", "rb") as png_file:
        raw_bytes = png_file.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    try:
        reply, _tokens, _speed = ollama_chat(endpoint, prompt="what is in the image", base64_image=encoded)
        return "42" in reply
    except Exception:
        # Any failure of the chat call (or an unusable reply) is reported as
        # "not multimodal" instead of being propagated to the caller.
        return False
 def main():
    parser = ArgumentParser(description="Testing the ollama API.")
    parser.add_argument('--api_base', required=False, default='http://localhost:11434', help='API base URL for the LLM, default is http://localhost:11434')
    parser.add_argument('--endpoint', required=False, default='', help='Name of an <endpoint>.json file in the endpoints directory')
    parser.add_argument('--model', required=False, default='llama3.2:latest', help='Name of the model to use, default is llama3.2:latest')
    parser.add_argument('--image', required=False, default=None, help='path to an image that shall be processed')
    # parse the arguments
    args = parser.parse_args()
    api_base = args.api_base
    endpoint_name = args.endpoint
    model_name = args.model
    image_path = args.image
    # load the endpoint file
    endpoint = {}
@@ -154,11 +201,29 @@ def main():
    else:
        endpoint = ollama_chat_endpoint(api_base, model_name)
    # test if the endpoint is a multimodal model
    if test_multimodal(endpoint):
        print("Endpoint is a multimodal model.")
    else:
        print("Endpoint is not a multimodal model.")
    # load the image, if a path is given
    base64_image = None
    if image_path:
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
    # access the ollama API
    models_dict = ollama_list()
    for (model, attr) in models_dict.items():
        print(f"Model: {model}")
-    answer, total_tokens, token_per_second = ollama_chat(endpoint)
+    try:
        if base64_image:
            answer, total_tokens, token_per_second = ollama_chat(endpoint, prompt="what is in the image", base64_image=base64_image)
        else:
            answer, total_tokens, token_per_second = ollama_chat(endpoint)
    except Exception as e:
        answer = f"Error: {str(e)}"
    print(answer)
 if __name__ == "__main__":
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 Pillow
 sympy
 urllib3
 requests