Multimodal Guide

Learn how to work with images and text using DeepSeek's multimodal capabilities for visual understanding and analysis.

Overview

DeepSeek's multimodal API allows you to:

- Analyze images: Describe, understand, and extract information from images
- Visual Q&A: Ask questions about image content
- OCR capabilities: Extract text from images
- Chart analysis: Understand graphs, charts, and diagrams
- Multi-image processing: Compare and analyze multiple images
- Combined workflows: Integrate text and visual processing

Getting Started

Basic Image Analysis

python

from openai import OpenAI

# Create an OpenAI-SDK client whose requests are sent to the DeepSeek base URL.
# The api_key value is a placeholder and must be replaced with a real key.
client = OpenAI(
    api_key="sk-your-deepseek-key",
    base_url="https://api.deepseek.com/v1"
)

# Analyze an image from URL
# The single user message carries two content parts: a text question and an
# image referenced by URL.
# NOTE(review): confirm that the selected model accepts "image_url" content parts.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What do you see in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://example.com/image.jpg"
                    }
                }
            ]
        }
    ]
)

# Print the text content of the first returned choice.
print(response.choices[0].message.content)

Base64 Image Input

python

import base64
from pathlib import Path

def encode_image(image_path: str) -> str:
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode in one go, base64-encoded, and the
    resulting bytes are decoded as UTF-8 so the value can be embedded in a
    string such as a ``data:`` URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Load and encode image
# The path below is a placeholder; point it at a real file before running.
image_path = "path/to/your/image.jpg"
base64_image = encode_image(image_path)

# Send the encoded image inline as a data URL (declared as image/jpeg)
# together with a text instruction in the same user message.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image in detail."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
)

# Print the text content of the first returned choice.
print(response.choices[0].message.content)

Image Input Methods

Supported Formats

python

# MIME types accepted by this guide, each mapped to its recognised file extensions
# (lower-case, including the leading dot).
SUPPORTED_FORMATS = {
    "image/jpeg": [".jpg", ".jpeg"],
    "image/png": [".png"],
    "image/gif": [".gif"],
    "image/webp": [".webp"]
}

def validate_image_format(file_path: str) -> bool:
    """Return True when the file's extension is listed in SUPPORTED_FORMATS.

    Only the extension is examined, case-insensitively; the file itself is
    never opened, so this says nothing about the actual content.

    Args:
        file_path: Path or file name to check.

    Returns:
        True if the lower-cased suffix matches a supported extension,
        otherwise False (including when there is no suffix at all).
    """
    file_extension = Path(file_path).suffix.lower()

    # Only the extension lists matter here, so iterate the values directly.
    return any(
        file_extension in extensions
        for extensions in SUPPORTED_FORMATS.values()
    )

# Usage: report whether the example file name has a supported extension.
image_path = "example.jpg"
print(
    "✅ Image format supported"
    if validate_image_format(image_path)
    else "❌ Image format not supported"
)

Image Size Limits

python

from PIL import Image
import os

def check_image_constraints(image_path: str, max_file_size: int = 20 * 1024 * 1024) -> dict:
    """Collect size and format facts about an image file.

    Args:
        image_path: Path of the image file to inspect.
        max_file_size: Largest acceptable file size in bytes. The default of
            20 MB is the illustrative limit used throughout this guide.

    Returns:
        A dict with the keys:
        - "file_size_ok": True when the file is no larger than ``max_file_size``
        - "file_size_mb": file size in megabytes (float)
        - "dimensions": "<width>x<height>" string
        - "format": the format reported by PIL for the opened image
        - "mode": the mode reported by PIL for the opened image

    Raises:
        Whatever ``os.path.getsize`` or ``Image.open`` raise for a missing or
        unreadable file; nothing is caught here.
    """
    # Get file size
    file_size = os.path.getsize(image_path)

    # Read every image attribute while the file is still open, instead of
    # touching the image object after the context manager has closed it.
    with Image.open(image_path) as img:
        width, height = img.size
        image_format = img.format
        image_mode = img.mode

    return {
        "file_size_ok": file_size <= max_file_size,
        "file_size_mb": file_size / (1024 * 1024),
        "dimensions": f"{width}x{height}",
        "format": image_format,
        "mode": image_mode
    }

# Usage
# Inspect one file, then print the size verdict, the size in MB (two decimals)
# and the "<width>x<height>" dimensions string from the returned dict.
constraints = check_image_constraints("example.jpg")
print(f"File size OK: {constraints['file_size_ok']}")
print(f"Size: {constraints['file_size_mb']:.2f} MB")
print(f"Dimensions: {constraints['dimensions']}")

Image Preprocessing

python

from PIL import Image
import io
import base64

class ImageProcessor:
    """Process images for optimal API usage.

    All public methods are static and return base64 text of a JPEG-encoded
    image, so the result can always be embedded in a ``data:image/jpeg`` URL.
    """

    @staticmethod
    def _as_rgb(img):
        """Return ``img`` unchanged if it is already RGB, else an RGB copy."""
        return img if img.mode == 'RGB' else img.convert('RGB')

    @staticmethod
    def _jpeg_base64(img, **save_options) -> str:
        """Encode ``img`` as an in-memory JPEG and return it as base64 text."""
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG', **save_options)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    @staticmethod
    def resize_image(image_path: str, max_size: tuple = (1024, 1024)) -> str:
        """Shrink the image to fit within ``max_size``, keeping aspect ratio.

        Args:
            image_path: Path of the image file.
            max_size: (max_width, max_height) bounding box in pixels.

        Returns:
            Base64 text of the resized image saved as JPEG at quality 85.
        """
        with Image.open(image_path) as img:
            # Convert first so the thumbnail is computed on RGB data.
            img = ImageProcessor._as_rgb(img)

            # thumbnail() resizes in place and never enlarges the image.
            img.thumbnail(max_size, Image.Resampling.LANCZOS)

            return ImageProcessor._jpeg_base64(img, quality=85)

    @staticmethod
    def compress_image(image_path: str, quality: int = 85) -> str:
        """Re-encode the image as an optimized JPEG to reduce its size.

        Args:
            image_path: Path of the image file.
            quality: JPEG quality setting passed to the encoder.

        Returns:
            Base64 text of the re-encoded JPEG (dimensions unchanged).
        """
        with Image.open(image_path) as img:
            img = ImageProcessor._as_rgb(img)
            return ImageProcessor._jpeg_base64(img, quality=quality, optimize=True)

    @staticmethod
    def prepare_image_for_api(image_path: str, max_size: tuple = (1024, 1024), quality: int = 85) -> str:
        """Pick the cheapest processing step that yields an API-ready JPEG.

        Decision order:
        1. Files larger than 10 MB are re-compressed.
        2. Images exceeding ``max_size`` in either dimension are resized.
        3. Files that are not already JPEG are converted to JPEG.
        4. Otherwise the file bytes are encoded unchanged.

        Args:
            image_path: Path of the image file.
            max_size: (max_width, max_height) bounding box in pixels.
            quality: JPEG quality used when re-encoding.

        Returns:
            Base64 text of a JPEG image.
        """
        # Check if image needs processing
        constraints = check_image_constraints(image_path)

        if constraints['file_size_mb'] > 10:  # If larger than 10MB
            print("Compressing large image...")
            return ImageProcessor.compress_image(image_path, quality)

        with Image.open(image_path) as img:
            oversized = img.size[0] > max_size[0] or img.size[1] > max_size[1]

        if oversized:
            print("Resizing large image...")
            return ImageProcessor.resize_image(image_path, max_size)

        # Every other branch returns JPEG data and callers label the payload
        # as image/jpeg, so a PNG/GIF/WebP must not be passed through untouched.
        if constraints['format'] != 'JPEG':
            print("Converting image to JPEG...")
            return ImageProcessor.compress_image(image_path, quality)

        # Image is fine as-is
        return encode_image(image_path)

# Usage
# prepare_image_for_api is a static method; it is called through an instance
# here, which works the same as calling it on the class.
processor = ImageProcessor()
optimized_image = processor.prepare_image_for_api("large_image.jpg")

# Send the prepared base64 payload inline as a data URL declared as image/jpeg.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this optimized image."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{optimized_image}"
                    }
                }
            ]
        }
    ]
)

Advanced Multimodal Techniques

Multi-Image Analysis

python

def analyze_multiple_images(image_paths: list, question: str) -> str:
    """Analyze multiple images in a single request.

    The question is sent as the first content part, followed by one inline
    ``data:`` URL per image, in the order given.

    Args:
        image_paths: Paths of the image files to include.
        question: Text prompt that applies to all images.

    Returns:
        The text content of the first choice in the API response.
    """
    content = [{"type": "text", "text": question}]

    for image_path in image_paths:
        # Declare the MIME type that matches the file extension rather than
        # always claiming JPEG; unknown extensions keep the previous default.
        suffix = Path(image_path).suffix.lower()
        mime_type = next(
            (
                mime
                for mime, extensions in SUPPORTED_FORMATS.items()
                if suffix in extensions
            ),
            "image/jpeg",
        )

        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            }
        })

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {
                "role": "user",
                "content": content
            }
        ]
    )

    return response.choices[0].message.content

# Usage
# Three example files are sent together with one comparison question,
# and the model's reply text is printed.
image_paths = ["image1.jpg", "image2.jpg", "image3.jpg"]
result = analyze_multiple_images(
    image_paths, 
    "Compare these three images and identify the main differences."
)
print(result)

Image Detail Control

python

def analyze_with_detail_control(image_path: str, detail_level: str = "auto") -> str:
    """Request an analysis of one image, passing a ``detail`` hint with it.

    Args:
        image_path: Path of the image file to send.
        detail_level: Value placed in the image part's "detail" field;
            the guide uses "low", "high", or "auto".

    Returns:
        The text content of the first choice in the API response.
    """
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encode_image(image_path)}",
            "detail": detail_level,  # "low", "high", or "auto"
        },
    }
    text_part = {
        "type": "text",
        "text": "Provide a detailed analysis of this image.",
    }

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )
    return completion.choices[0].message.content

# Usage examples
# One call per detail setting; each result holds the reply text for that image.
result_auto = analyze_with_detail_control("complex_chart.jpg", "auto")
result_high = analyze_with_detail_control("detailed_diagram.jpg", "high")
result_low = analyze_with_detail_control("simple_icon.jpg", "low")

Conversational Image Analysis

python

class ImageConversation:
    """Running chat history that can mix text-only and image-bearing turns.

    ``messages`` is the list sent to the API on every ``get_response`` call,
    so earlier turns (including images) stay part of the context.
    """

    def __init__(self):
        self.messages = []

    def add_image_message(self, image_path: str, text: str):
        """Append a user turn made of ``text`` plus the image at ``image_path``."""
        data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
        text_part = {"type": "text", "text": text}
        image_part = {"type": "image_url", "image_url": {"url": data_url}}
        self.messages.append({"role": "user", "content": [text_part, image_part]})

    def add_text_message(self, text: str, role: str = "user"):
        """Append a plain-text turn for ``role`` (defaults to the user)."""
        entry = {"role": role, "content": text}
        self.messages.append(entry)

    def get_response(self) -> str:
        """Send the whole history, record the reply as an assistant turn, return it."""
        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=self.messages,
        )
        reply = completion.choices[0].message.content

        # Store the reply so follow-up questions see it as context.
        self.add_text_message(reply, "assistant")
        return reply

    def clear_conversation(self):
        """Drop the history by rebinding ``messages`` to a fresh empty list."""
        self.messages = list()

# Usage
conversation = ImageConversation()

# Start with an image
conversation.add_image_message("chart.jpg", "What does this chart show?")
response1 = conversation.get_response()
print("AI:", response1)

# Follow up with text
# The first image and the first reply are still in the history that is sent.
conversation.add_text_message("What are the key trends you notice?")
response2 = conversation.get_response()
print("AI:", response2)

# Add another image for comparison
# Both images are now part of the same message history.
conversation.add_image_message("chart2.jpg", "How does this compare to the previous chart?")
response3 = conversation.get_response()
print("AI:", response3)

Specialized Use Cases

OCR and Text Extraction

python

def extract_text_from_image(image_path: str) -> str:
    """Ask the model to transcribe all text visible in an image.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The text content of the first choice in the API response.
    """
    instruction = {
        "type": "text",
        "text": "Extract all text from this image. Provide the text exactly as it appears, maintaining formatting where possible.",
    }
    picture = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"},
    }

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [instruction, picture]}],
    )
    return completion.choices[0].message.content

def extract_structured_text(image_path: str) -> str:
    """Ask the model to return an image's text organized as a JSON structure.

    The prompt requests a JSON object with title, headings, body_text,
    tables, lists and other fields. The reply is returned exactly as the
    model produced it: it is NOT parsed, so the caller gets a string and
    must run ``json.loads`` (and handle invalid JSON) to obtain a dict.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The raw text content of the first choice in the API response.
    """
    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Extract text from this image and organize it into a JSON structure with the following format:
                        {
                            "title": "main title if any",
                            "headings": ["list of headings"],
                            "body_text": "main content",
                            "tables": ["any tabular data"],
                            "lists": ["any bullet points or lists"],
                            "other": "any other text"
                        }"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )

    # message.content is a string, hence the ``str`` return annotation.
    return response.choices[0].message.content

# Usage
# Both calls send the same file; each returns the model's reply text.
text_content = extract_text_from_image("document.jpg")
structured_content = extract_structured_text("document.jpg")

Chart and Graph Analysis

python

def analyze_chart(image_path: str, analysis_type: str = "comprehensive") -> str:
    """Send a chart image with a prompt chosen by ``analysis_type``.

    Args:
        image_path: Path of the chart image file.
        analysis_type: One of "comprehensive", "trends", "data_extraction",
            "insights" or "comparison". Any other value falls back to the
            "comprehensive" prompt.

    Returns:
        The text content of the first choice in the API response.
    """
    prompts_by_focus = {
        "comprehensive": "Provide a comprehensive analysis of this chart including data trends, key insights, and conclusions.",
        "trends": "Focus on identifying and describing the main trends shown in this chart.",
        "data_extraction": "Extract the specific data points and values shown in this chart.",
        "insights": "What are the key business insights that can be derived from this chart?",
        "comparison": "Compare the different data series or categories shown in this chart."
    }
    chosen_prompt = prompts_by_focus.get(analysis_type, prompts_by_focus["comprehensive"])

    data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": chosen_prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[user_turn],
    )
    return completion.choices[0].message.content

# Usage examples
# The same chart file is analyzed three times, each with a different prompt focus.
comprehensive_analysis = analyze_chart("sales_chart.jpg", "comprehensive")
trend_analysis = analyze_chart("sales_chart.jpg", "trends")
data_extraction = analyze_chart("sales_chart.jpg", "data_extraction")

Visual Question Answering

python

class VisualQA:
    """Question answering over images that are encoded once and reused.

    ``image_cache`` maps an image id to the base64 text of that image, so
    repeated questions do not re-read the file.
    """

    def __init__(self):
        self.image_cache = {}

    def load_image(self, image_path: str, image_id: str = None):
        """Encode the image and store it under ``image_id`` (or its path if omitted)."""
        cache_key = image_path if image_id is None else image_id
        self.image_cache[cache_key] = encode_image(image_path)

    def ask_question(self, question: str, image_id: str) -> str:
        """Ask one question about a previously loaded image.

        Raises:
            ValueError: if ``image_id`` has not been loaded.
        """
        if image_id not in self.image_cache:
            raise ValueError(f"Image {image_id} not found in cache")

        question_part = {"type": "text", "text": question}
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{self.image_cache[image_id]}"
            },
        }

        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "user", "content": [question_part, image_part]}],
        )
        return completion.choices[0].message.content

    def batch_questions(self, questions: list, image_id: str) -> dict:
        """Ask each question in turn; return a dict mapping question to answer."""
        return {
            question: self.ask_question(question, image_id)
            for question in questions
        }

# Usage
vqa = VisualQA()

# Load image once
# The file is encoded a single time and stored under the id "product".
vqa.load_image("product_photo.jpg", "product")

# Ask multiple questions
questions = [
    "What color is the product?",
    "What is the brand name?",
    "What are the key features visible?",
    "Is there any text or labels?",
    "What is the approximate size?"
]

# One API request is made per question; results are keyed by question text.
answers = vqa.batch_questions(questions, "product")

for question, answer in answers.items():
    print(f"Q: {question}")
    print(f"A: {answer}\n")

Document Analysis

python

def analyze_document(image_path: str, document_type: str = "general") -> dict:
    """Send a document image with an extraction prompt matched to its type.

    Args:
        image_path: Path of the document image file.
        document_type: One of "invoice", "receipt", "contract", "form" or
            "general". Any other value falls back to the "general" prompt.

    Returns:
        A dict with "document_type" (as passed in), "analysis" (the reply
        text of the first choice) and "image_path" (as passed in).
    """
    prompts_by_type = {
        "invoice": """Analyze this invoice and extract:
        - Invoice number
        - Date
        - Vendor information
        - Line items with quantities and prices
        - Total amount
        - Payment terms""",

        "receipt": """Analyze this receipt and extract:
        - Store name and location
        - Date and time
        - Items purchased with prices
        - Subtotal, tax, and total
        - Payment method""",

        "contract": """Analyze this contract and identify:
        - Parties involved
        - Key terms and conditions
        - Important dates
        - Financial obligations
        - Signatures and dates""",

        "form": """Analyze this form and extract:
        - Form title and purpose
        - All filled-in fields and values
        - Empty fields that need completion
        - Instructions or notes""",

        "general": """Analyze this document and provide:
        - Document type and purpose
        - Key information and data
        - Structure and organization
        - Important details"""
    }
    chosen_prompt = prompts_by_type.get(document_type, prompts_by_type["general"])

    data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": chosen_prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[user_turn],
    )

    report = {"document_type": document_type}
    report["analysis"] = completion.choices[0].message.content
    report["image_path"] = image_path
    return report

# Usage
# Each call returns a dict holding the document type, the reply text and the path.
invoice_analysis = analyze_document("invoice.jpg", "invoice")
receipt_analysis = analyze_document("receipt.jpg", "receipt")

Performance Optimization

Batch Processing

python

import asyncio
from concurrent.futures import ThreadPoolExecutor
import time

class MultimodalBatchProcessor:
    """Run image-analysis requests concurrently on a thread pool.

    ``max_workers`` is the thread pool size used for every batch call.
    """

    def __init__(self, max_workers: int = 5):
        self.max_workers = max_workers

    def process_single_image(self, image_path: str, prompt: str) -> dict:
        """Analyze one image and report the outcome without raising.

        Returns:
            A dict with "image_path", "success" and "processing_time"
            (seconds), plus "result" on success or "error" (the exception
            text) on failure.
        """
        started = time.time()
        outcome = {"image_path": image_path}

        try:
            data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
            completion = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": data_url}},
                        ],
                    }
                ],
            )
            answer = completion.choices[0].message.content
            outcome["success"] = True
            outcome["result"] = answer
        except Exception as exc:
            # Any failure is turned into a result entry so one bad image
            # does not abort the rest of the batch.
            outcome["success"] = False
            outcome["error"] = str(exc)

        outcome["processing_time"] = time.time() - started
        return outcome

    def _run_jobs(self, jobs) -> list:
        """Submit every (image_path, prompt) pair and collect results in submission order."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending = [
                pool.submit(self.process_single_image, path, text)
                for path, text in jobs
            ]
            return [task.result() for task in pending]

    def process_batch(self, image_paths: list, prompt: str) -> list:
        """Analyze all images in parallel using the same prompt for each."""
        return self._run_jobs((path, prompt) for path in image_paths)

    def process_with_different_prompts(self, image_prompt_pairs: list) -> list:
        """Analyze (image_path, prompt) pairs in parallel, one prompt per image."""
        return self._run_jobs(image_prompt_pairs)

# Usage
# NOTE: this rebinds the module-level name `processor` to a batch processor.
processor = MultimodalBatchProcessor(max_workers=3)

# Process multiple images with same prompt
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
results = processor.process_batch(image_paths, "Describe this image in detail.")

# Process images with different prompts
image_prompt_pairs = [
    ("chart1.jpg", "Analyze this chart and extract key trends."),
    ("document1.jpg", "Extract all text from this document."),
    ("product1.jpg", "Describe this product and its features.")
]

# `results` is overwritten here, so only the second batch is printed below.
results = processor.process_with_different_prompts(image_prompt_pairs)

# Print results
# Each entry reports success with its processing time, or failure with the error text.
for result in results:
    if result["success"]:
        print(f"✅ {result['image_path']}: Processed in {result['processing_time']:.2f}s")
    else:
        print(f"❌ {result['image_path']}: Error - {result['error']}")

Caching and Optimization

python

import hashlib
import json
from pathlib import Path

class MultimodalCache:
    """Cache multimodal API responses as JSON files on disk.

    Each entry is stored in ``cache_dir`` under a key derived from the image
    file's bytes and the prompt text, so editing either one produces a new
    entry rather than returning a stale response.
    """

    def __init__(self, cache_dir: str = "./cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True lets a nested cache directory be created in one step.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_key(self, image_path: str, prompt: str) -> str:
        """Return "<md5 of image bytes>_<md5 of prompt>" for this request.

        MD5 is used only as a fingerprint for cache lookup, not for security.
        """
        image_hash = hashlib.md5(Path(image_path).read_bytes()).hexdigest()
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()

        return f"{image_hash}_{prompt_hash}"

    def _cache_file_for(self, image_path: str, prompt: str) -> Path:
        """Return the path of the JSON file that holds this request's entry."""
        return self.cache_dir / f"{self._get_cache_key(image_path, prompt)}.json"

    def get_cached_response(self, image_path: str, prompt: str) -> "dict | None":
        """Return the cached entry for this image/prompt pair, or None.

        A missing file, a file that is not valid JSON, or JSON that is not a
        dict containing "response" all count as a cache miss, so a damaged
        entry triggers a fresh API call instead of an exception.
        """
        cache_file = self._cache_file_for(image_path, prompt)

        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached = json.load(f)
        except FileNotFoundError:
            return None
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Unreadable entry: treat as a miss; it is overwritten on the next store.
            return None

        if not isinstance(cached, dict) or "response" not in cached:
            return None

        return cached

    def cache_response(self, image_path: str, prompt: str, response: str):
        """Write the response, with its image path, prompt and a timestamp, to the cache."""
        cache_data = {
            "image_path": image_path,
            "prompt": prompt,
            "response": response,
            "timestamp": time.time()
        }

        with open(self._cache_file_for(image_path, prompt), "w", encoding="utf-8") as f:
            json.dump(cache_data, f, indent=2)

    def analyze_with_cache(self, image_path: str, prompt: str) -> str:
        """Return the analysis for this image/prompt, calling the API only on a cache miss."""
        # Check cache first
        cached = self.get_cached_response(image_path, prompt)
        if cached is not None:
            print(f"✅ Using cached response for {image_path}")
            return cached["response"]

        # Make API call
        print(f"🔄 Making API call for {image_path}")
        base64_image = encode_image(image_path)

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )

        result = response.choices[0].message.content

        # Cache the response
        self.cache_response(image_path, prompt, result)

        return result

# Usage
# Uses the default cache directory "./cache".
cache = MultimodalCache()

# First call - makes API request
result1 = cache.analyze_with_cache("image.jpg", "Describe this image.")

# Second call - uses cache
# Same file bytes and same prompt give the same cache key.
result2 = cache.analyze_with_cache("image.jpg", "Describe this image.")

Best Practices

Prompt Engineering for Vision

python

# Reusable prompt texts keyed by vision task type.
VISION_PROMPT_TEMPLATES = {
    "detailed_description": """Provide a detailed description of this image including:
    - Main subjects and objects
    - Colors, lighting, and composition
    - Setting and background
    - Any text or symbols visible
    - Overall mood or atmosphere""",

    "technical_analysis": """Analyze this image from a technical perspective:
    - Image quality and resolution
    - Composition and framing
    - Lighting conditions
    - Any technical issues or artifacts
    - Suggestions for improvement""",

    "accessibility_description": """Create an accessibility description for this image:
    - Describe all visual elements clearly
    - Include spatial relationships
    - Mention colors and their significance
    - Describe any text or important details
    - Keep it concise but comprehensive""",

    "data_extraction": """Extract all data and information from this image:
    - Any numerical data or statistics
    - Text content and labels
    - Categorical information
    - Relationships between elements
    - Structure the output clearly""",

    "comparison_analysis": """Compare and analyze the elements in this image:
    - Identify different categories or groups
    - Compare sizes, colors, or quantities
    - Note similarities and differences
    - Highlight the most significant findings"""
}

def get_optimized_prompt(task_type: str, custom_instructions: str = "") -> str:
    """Look up the prompt template for a task and optionally extend it.

    Args:
        task_type: Key into VISION_PROMPT_TEMPLATES; unknown keys fall back
            to the "detailed_description" template.
        custom_instructions: Extra text appended after the template under an
            "Additional instructions:" label. Skipped when empty.

    Returns:
        The final prompt text.
    """
    fallback = VISION_PROMPT_TEMPLATES["detailed_description"]
    template = VISION_PROMPT_TEMPLATES.get(task_type, fallback)

    if not custom_instructions:
        return template

    return f"{template}\n\nAdditional instructions: {custom_instructions}"

# Usage
# Builds the "data_extraction" template followed by the extra instruction text.
prompt = get_optimized_prompt("data_extraction", "Focus on financial data and percentages")

Error Handling

python

def robust_image_analysis(image_path: str, prompt: str, max_retries: int = 3) -> dict:
    """Analyze an image with up-front validation and retried API calls.

    Validation (file exists, supported extension, size within limit) runs
    once before any request is made, because its outcome cannot change
    between attempts. Only reading the file and the API call are retried,
    with exponential backoff (1s, 2s, 4s, ...) between attempts.

    Args:
        image_path: Path of the image file to analyze.
        prompt: Text instruction sent with the image.
        max_retries: Maximum number of attempts.

    Returns:
        On success: {"success": True, "result": <reply text>, "attempt": <n>}.
        On failure: a dict with "success": False and "error"; when all
        attempts were used it also carries "attempts". This function reports
        errors through the returned dict rather than raising.
    """
    # --- Validation: deterministic, so done once and never retried ---
    if not Path(image_path).exists():
        return {"success": False, "error": f"Image file not found: {image_path}"}

    if not validate_image_format(image_path):
        return {"success": False, "error": f"Unsupported image format: {image_path}"}

    try:
        constraints = check_image_constraints(image_path)
    except Exception as e:
        # An unreadable image will not become readable by retrying.
        return {"success": False, "error": str(e)}

    if not constraints["file_size_ok"]:
        return {
            "success": False,
            "error": f"Image too large: {constraints['file_size_mb']:.2f} MB"
        }

    # Declare the MIME type that matches the validated extension instead of
    # always claiming JPEG.
    suffix = Path(image_path).suffix.lower()
    mime_type = next(
        (
            mime
            for mime, extensions in SUPPORTED_FORMATS.items()
            if suffix in extensions
        ),
        "image/jpeg",
    )

    for attempt in range(max_retries):
        try:
            # Process image
            base64_image = encode_image(image_path)

            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                timeout=30  # 30 second timeout
            )

            return {
                "success": True,
                "result": response.choices[0].message.content,
                "attempt": attempt + 1
            }

        except Exception as e:
            if attempt == max_retries - 1:  # Last attempt
                return {
                    "success": False,
                    "error": str(e),
                    "attempts": max_retries
                }

            # Wait before retry
            time.sleep(2 ** attempt)  # Exponential backoff

    # Only reached when max_retries <= 0, i.e. no attempt was made at all.
    return {"success": False, "error": "Max retries exceeded"}

# Usage
result = robust_image_analysis("image.jpg", "Describe this image.")

# .get() is used because the returned dict is not guaranteed to have a "success" key.
if result.get("success"):
    print(f"✅ Success: {result['result']}")
else:
    print(f"❌ Error: {result['error']}")

Integration Examples

Web Application Integration

python

from flask import Flask, request, jsonify
import tempfile
import os

app = Flask(__name__)

@app.route('/analyze-image', methods=['POST'])
def analyze_image_endpoint():
    """Web endpoint for image analysis.

    Expects a multipart POST with an "image" file and an optional "prompt"
    form field (defaults to "Describe this image.").

    Returns:
        200 with {"success": True, "analysis": ...} on success,
        400 when no file was provided or selected,
        500 with an error message when analysis fails or an exception occurs.
    """
    try:
        # Get uploaded file
        if 'image' not in request.files:
            return jsonify({"error": "No image file provided"}), 400

        file = request.files['image']
        prompt = request.form.get('prompt', 'Describe this image.')

        if file.filename == '':
            return jsonify({"error": "No file selected"}), 400

        # Reserve a temporary path, then close the handle right away so the
        # file can be written and deleted by name on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
            temp_path = tmp_file.name

        try:
            file.save(temp_path)

            # Analyze image
            result = robust_image_analysis(temp_path, prompt)
        finally:
            # Clean up even when saving or analysis raises, so failed
            # requests do not leave temp files behind.
            os.unlink(temp_path)

        if result.get("success"):
            return jsonify({
                "success": True,
                "analysis": result["result"]
            })

        return jsonify({
            "success": False,
            "error": result["error"]
        }), 500

    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Start Flask's built-in server only when this file is executed directly.
# NOTE(review): debug=True is suitable for local development only; confirm it
# is disabled before exposing this service.
if __name__ == '__main__':
    app.run(debug=True)

Streamlit Application

python

# Streamlit front end for image analysis: the user uploads an image, picks an
# analysis type (or writes a custom prompt), and the result of
# robust_image_analysis (defined earlier in this guide) is shown.
import io
import os
import tempfile

import streamlit as st
from PIL import Image

st.set_page_config(page_title="Multimodal Image Analyzer", layout="wide")

st.title("🖼️ Multimodal Image Analyzer")

# Sidebar configuration
with st.sidebar:
    st.header("Configuration")
    
    analysis_type = st.selectbox(
        "Analysis Type",
        ["General Description", "OCR Text Extraction", "Chart Analysis", "Document Analysis"]
    )
    
    custom_prompt = st.text_area(
        "Custom Prompt (optional)",
        placeholder="Enter your custom analysis prompt..."
    )

# Main interface
col1, col2 = st.columns([1, 1])

# Stays None until an image has been uploaded and opened.
image = None

with col1:
    st.header("Upload Image")
    
    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=['jpg', 'jpeg', 'png', 'gif', 'webp']
    )
    
    if uploaded_file is not None:
        # Display image. No temp file is written here: Streamlit reruns the
        # whole script on every interaction, so writing one at this point
        # would leave a new file behind on each rerun.
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

with col2:
    st.header("Analysis Results")
    
    if image is not None:
        if st.button("Analyze Image", type="primary"):
            with st.spinner("Analyzing image..."):
                # Determine prompt based on analysis type
                if custom_prompt:
                    prompt = custom_prompt
                else:
                    prompt_map = {
                        "General Description": "Provide a detailed description of this image.",
                        "OCR Text Extraction": "Extract all text from this image.",
                        "Chart Analysis": "Analyze this chart and provide insights about the data.",
                        "Document Analysis": "Analyze this document and extract key information."
                    }
                    prompt = prompt_map[analysis_type]
                
                # JPEG cannot store alpha or palette data, so PNG/GIF/WebP
                # uploads in modes such as RGBA or P must be converted first.
                if image.mode in ("RGB", "L"):
                    jpeg_ready = image
                else:
                    jpeg_ready = image.convert("RGB")
                
                # Reserve a temp path and close the handle before saving by
                # name (an open NamedTemporaryFile cannot be reopened on Windows).
                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
                    temp_path = tmp_file.name
                
                try:
                    jpeg_ready.save(temp_path, format='JPEG')
                    
                    # Analyze image
                    result = robust_image_analysis(temp_path, prompt)
                finally:
                    # Clean up even if saving or analysis raised.
                    os.unlink(temp_path)
                
                if result.get("success"):
                    st.success("Analysis completed!")
                    st.write(result["result"])
                else:
                    st.error(f"Analysis failed: {result['error']}")

Troubleshooting

Common Issues

python

def diagnose_multimodal_issues(image_path: str, error_message: str) -> list:
    """Build a list of likely causes for a failed multimodal request.

    Args:
        image_path: Path of the image that was sent.
        error_message: Error text from the failed call; matched
            case-insensitively against known keywords.

    Returns:
        Human-readable findings, in the order the checks run. A missing file
        is reported on its own; when nothing matches, the list holds a single
        "no obvious issues" entry.
    """
    # Without a file none of the remaining checks can run, so stop here.
    if not Path(image_path).exists():
        return ["❌ Image file does not exist"]

    findings = []

    # File-level checks: format first, then size.
    if not validate_image_format(image_path):
        findings.append("❌ Unsupported image format")

    limits = check_image_constraints(image_path)
    if not limits["file_size_ok"]:
        findings.append(f"❌ Image too large: {limits['file_size_mb']:.2f} MB")

    # Error-text checks: a hint applies when all of its keywords appear.
    message = error_message.lower()
    keyword_hints = (
        (("timeout",), "❌ Request timeout - try reducing image size"),
        (("rate limit",), "❌ Rate limit exceeded - wait before retrying"),
        (("invalid", "base64"), "❌ Invalid base64 encoding - check image processing"),
        (("model",), "❌ Model error - ensure using 'deepseek-chat'"),
    )
    for keywords, hint in keyword_hints:
        if all(word in message for word in keywords):
            findings.append(hint)

    return findings or ["✅ No obvious issues detected"]

# Usage: diagnose a failed request and print one finding per line.
issues = diagnose_multimodal_issues("problematic_image.jpg", "Request timeout error")
for finding in issues:
    print(finding)

Multimodal Guide

Overview

Getting Started

Basic Image Analysis

Base64 Image Input

Image Input Methods

Supported Formats

Image Size Limits

Image Preprocessing

Advanced Multimodal Techniques

Multi-Image Analysis

Image Detail Control

Conversational Image Analysis

Specialized Use Cases

OCR and Text Extraction

Chart and Graph Analysis

Visual Question Answering

Document Analysis

Performance Optimization

Batch Processing

Caching and Optimization

Best Practices

Prompt Engineering for Vision

Error Handling

Integration Examples

Web Application Integration

Streamlit Application

Troubleshooting

Common Issues

Next Steps

Multimodal Guide

Overview

Getting Started

Basic Image Analysis

Base64 Image Input

Image Input Methods

Supported Formats

Image Size Limits

Image Preprocessing

Advanced Multimodal Techniques

Multi-Image Analysis

Image Detail Control

Conversational Image Analysis

Specialized Use Cases

OCR and Text Extraction

Chart and Graph Analysis

Visual Question Answering

Document Analysis

Performance Optimization

Batch Processing

Caching and Optimization

Best Practices

Prompt Engineering for Vision

Error Handling

Integration Examples

Web Application Integration

Streamlit Application

Troubleshooting

Common Issues

Next Steps