Skip to content

Multimodal Guide

Learn how to work with images and text using DeepSeek's multimodal capabilities for visual understanding and analysis.

Overview

DeepSeek's multimodal API allows you to:

  • Analyze images: Describe, understand, and extract information from images
  • Visual Q&A: Ask questions about image content
  • OCR capabilities: Extract text from images
  • Chart analysis: Understand graphs, charts, and diagrams
  • Multi-image processing: Compare and analyze multiple images
  • Combined workflows: Integrate text and visual processing

Getting Started

Basic Image Analysis

python
from openai import OpenAI

# Point the OpenAI SDK client at the DeepSeek endpoint.
# The key below is a placeholder value.
client = OpenAI(api_key="sk-your-deepseek-key", base_url="https://api.deepseek.com/v1")

# One user turn whose content pairs a text question with a remote image URL.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What do you see in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
        ],
    }],
)

# The reply text is read from the first returned choice.
print(response.choices[0].message.content)

Base64 Image Input

python
import base64
from pathlib import Path

def encode_image(image_path: str) -> str:
    """Read a file and return its bytes as a base64 text string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file contents, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Read the local file and base64-encode it so it can travel inline in the request.
image_path = "path/to/your/image.jpg"
base64_image = encode_image(image_path)

# Same message shape as the URL example, except "url" is a data: URI carrying the bytes.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
        ],
    }],
)

print(response.choices[0].message.content)

Image Input Methods

Supported Formats

python
# Maps each accepted MIME type to the lower-case file extensions (with the
# leading dot) that identify it.
SUPPORTED_FORMATS = {
    "image/jpeg": [".jpg", ".jpeg"],
    "image/png": [".png"],
    "image/gif": [".gif"],
    "image/webp": [".webp"]
}

def validate_image_format(file_path: str) -> bool:
    """Report whether a path's extension is listed in SUPPORTED_FORMATS.

    Only the file name is inspected; the file is never opened, so this says
    nothing about the actual contents.

    Args:
        file_path: Path or file name to check. The extension is compared
            case-insensitively.

    Returns:
        True if the extension belongs to any supported MIME type, else False
        (including when the path has no extension).
    """
    file_extension = Path(file_path).suffix.lower()

    # Only the extension lists matter here, so iterate the values directly.
    return any(
        file_extension in extensions
        for extensions in SUPPORTED_FORMATS.values()
    )

# Usage: report whether the file name carries a supported extension.
image_path = "example.jpg"
print(
    "✅ Image format supported"
    if validate_image_format(image_path)
    else "❌ Image format not supported"
)

Image Size Limits

python
from PIL import Image
import os

def check_image_constraints(image_path: str) -> dict:
    """Collect size and format facts about an image file.

    Args:
        image_path: Path of the image to inspect.

    Returns:
        A dict with the keys:
            "file_size_ok": True when the file is at most MAX_FILE_SIZE bytes.
            "file_size_mb": File size in MiB as a float.
            "dimensions": "<width>x<height>" string.
            "format": The format reported by PIL for the opened image.
            "mode": The PIL mode of the opened image.
    """

    # File size limit (example: 20MB)
    MAX_FILE_SIZE = 20 * 1024 * 1024  # 20MB in bytes

    file_size = os.path.getsize(image_path)

    # Read every attribute while the image is still open, instead of touching
    # the image object after the context manager has closed it.
    with Image.open(image_path) as img:
        width, height = img.size
        image_format = img.format
        image_mode = img.mode

    return {
        "file_size_ok": file_size <= MAX_FILE_SIZE,
        "file_size_mb": file_size / (1024 * 1024),
        "dimensions": f"{width}x{height}",
        "format": image_format,
        "mode": image_mode
    }

# Usage: inspect one file and print three of the collected facts, one per line.
constraints = check_image_constraints("example.jpg")
print(
    f"File size OK: {constraints['file_size_ok']}",
    f"Size: {constraints['file_size_mb']:.2f} MB",
    f"Dimensions: {constraints['dimensions']}",
    sep="\n",
)

Image Preprocessing

python
from PIL import Image
import io
import base64

class ImageProcessor:
    """Static helpers that turn an image file into a base64 string for the API."""

    @staticmethod
    def resize_image(image_path: str, max_size: tuple = (1024, 1024)) -> str:
        """Shrink an image to fit within ``max_size`` and return it as base64 JPEG.

        Args:
            image_path: Path of the image to open.
            max_size: (width, height) bound passed to ``thumbnail``, which
                keeps the aspect ratio.

        Returns:
            Base64 text of the image re-encoded as JPEG at quality 85.
        """
        with Image.open(image_path) as source:
            # Non-RGB images are converted to RGB before the JPEG save.
            picture = source if source.mode == 'RGB' else source.convert('RGB')

            picture.thumbnail(max_size, Image.Resampling.LANCZOS)

            jpeg_bytes = io.BytesIO()
            picture.save(jpeg_bytes, format='JPEG', quality=85)

            return base64.b64encode(jpeg_bytes.getvalue()).decode('utf-8')

    @staticmethod
    def compress_image(image_path: str, quality: int = 85) -> str:
        """Re-encode an image as an optimized JPEG and return it as base64.

        Args:
            image_path: Path of the image to open.
            quality: JPEG quality setting passed to ``save``.

        Returns:
            Base64 text of the re-encoded JPEG. Dimensions are not changed.
        """
        with Image.open(image_path) as source:
            picture = source if source.mode == 'RGB' else source.convert('RGB')

            jpeg_bytes = io.BytesIO()
            picture.save(jpeg_bytes, format='JPEG', quality=quality, optimize=True)

            return base64.b64encode(jpeg_bytes.getvalue()).decode('utf-8')

    @staticmethod
    def prepare_image_for_api(image_path: str, max_size: tuple = (1024, 1024), quality: int = 85) -> str:
        """Pick the cheapest preparation step for an image and return base64 text.

        Files over 10 MB are compressed; otherwise images larger than
        ``max_size`` in either dimension are resized; anything else is encoded
        unchanged.

        Args:
            image_path: Path of the image to prepare.
            max_size: (width, height) bound used for the resize decision.
            quality: JPEG quality used when compressing.

        Returns:
            Base64 text of the (possibly re-encoded) image.
        """
        # NOTE(review): the unchanged path returns the original file bytes,
        # which are not necessarily JPEG — confirm callers that label the
        # result as image/jpeg only pass JPEG files.
        file_facts = check_image_constraints(image_path)

        if file_facts['file_size_mb'] > 10:  # If larger than 10MB
            print("Compressing large image...")
            return ImageProcessor.compress_image(image_path, quality)

        with Image.open(image_path) as probe:
            width, height = probe.size
            if width > max_size[0] or height > max_size[1]:
                print("Resizing large image...")
                return ImageProcessor.resize_image(image_path, max_size)

        # Small enough on both counts: send the file as it is.
        return encode_image(image_path)

# Usage: prepare a local file, then send the resulting base64 text inline.
processor = ImageProcessor()
optimized_image = processor.prepare_image_for_api("large_image.jpg")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Analyze this optimized image."},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{optimized_image}"}},
        ],
    }],
)

Advanced Multimodal Techniques

Multi-Image Analysis

python
def analyze_multiple_images(image_paths: list, question: str) -> str:
    """Send one question together with several images in a single request.

    Args:
        image_paths: Paths of the image files to attach, in the order they
            should appear after the question.
        question: Text prompt placed first in the message content.

    Returns:
        The text content of the first choice in the API response.
    """
    # NOTE(review): every image is labelled image/jpeg regardless of its real
    # type — confirm the API tolerates this for PNG/GIF/WebP inputs.
    image_parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encode_image(image_path)}"
            }
        }
        for image_path in image_paths
    ]

    # The question goes first, followed by one content part per image.
    content = [{"type": "text", "text": question}, *image_parts]

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {
                "role": "user",
                "content": content
            }
        ]
    )

    return response.choices[0].message.content

# Usage: attach three files to one comparison question.
image_paths = ["image1.jpg", "image2.jpg", "image3.jpg"]
result = analyze_multiple_images(image_paths, "Compare these three images and identify the main differences.")
print(result)

Image Detail Control

python
def analyze_with_detail_control(image_path: str, detail_level: str = "auto") -> str:
    """Request an analysis of one image, passing a "detail" hint with it.

    Args:
        image_path: Path of the image file to encode and send.
        detail_level: Value placed in the image part's "detail" field.

    Returns:
        The text content of the first choice in the API response.
    """
    encoded = encode_image(image_path)

    text_part = {"type": "text", "text": "Provide a detailed analysis of this image."}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded}",
            "detail": detail_level,  # "low", "high", or "auto"
        },
    }

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )

    return completion.choices[0].message.content

# Usage examples
# Each call sends the same fixed prompt; only the image file and the value
# placed in the "detail" field ("auto", "high", "low") differ.
result_auto = analyze_with_detail_control("complex_chart.jpg", "auto")
result_high = analyze_with_detail_control("detailed_diagram.jpg", "high")
result_low = analyze_with_detail_control("simple_icon.jpg", "low")

Conversational Image Analysis

python
class ImageConversation:
    """Accumulate a chat history that can mix image turns and text turns."""

    def __init__(self):
        # Ordered list of message dicts sent to the API on every get_response().
        self.messages = []

    def add_image_message(self, image_path: str, text: str):
        """Append a user turn containing ``text`` followed by the encoded image.

        Args:
            image_path: Path of the image file to encode and attach.
            text: Text placed before the image in the same turn.
        """
        encoded = encode_image(image_path)
        text_part = {"type": "text", "text": text}
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
        self.messages.append({"role": "user", "content": [text_part, image_part]})

    def add_text_message(self, text: str, role: str = "user"):
        """Append a plain-text turn.

        Args:
            text: Message content.
            role: Value stored in the message's "role" field.
        """
        entry = {"role": role, "content": text}
        self.messages.append(entry)

    def get_response(self) -> str:
        """Send the whole history, record the reply as an assistant turn, and return it.

        Returns:
            The text content of the first choice in the API response.
        """
        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=self.messages,
        )

        reply = completion.choices[0].message.content
        # Store the reply so follow-up questions are answered in context.
        self.add_text_message(reply, "assistant")
        return reply

    def clear_conversation(self):
        """Drop the history by binding ``messages`` to a new empty list."""
        self.messages = []

# Usage: three turns that share one history.
conversation = ImageConversation()

# Turn 1: an image plus a question about it.
conversation.add_image_message("chart.jpg", "What does this chart show?")
response1 = conversation.get_response()
print(f"AI: {response1}")

# Turn 2: a text-only follow-up.
conversation.add_text_message("What are the key trends you notice?")
response2 = conversation.get_response()
print(f"AI: {response2}")

# Turn 3: a second image, compared against the earlier one.
conversation.add_image_message("chart2.jpg", "How does this compare to the previous chart?")
response3 = conversation.get_response()
print(f"AI: {response3}")

Specialized Use Cases

OCR and Text Extraction

python
def extract_text_from_image(image_path: str) -> str:
    """Ask the model to transcribe all text visible in an image.

    Args:
        image_path: Path of the image file to encode and send.

    Returns:
        The text content of the first choice in the API response.
    """
    encoded = encode_image(image_path)

    instruction = {
        "type": "text",
        "text": "Extract all text from this image. Provide the text exactly as it appears, maintaining formatting where possible.",
    }
    picture = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [instruction, picture]}],
    )

    return completion.choices[0].message.content

def extract_structured_text(image_path: str) -> str:
    """Ask the model to return an image's text organized as a JSON structure.

    The prompt requests a JSON object with the keys "title", "headings",
    "body_text", "tables", "lists" and "other". The reply is returned as the
    raw text the model produced; it is NOT parsed here, so callers that need
    a dict must parse (and validate) it themselves.

    Args:
        image_path: Path of the image file to encode and send.

    Returns:
        The text content of the first choice in the API response.
    """

    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Extract text from this image and organize it into a JSON structure with the following format:
                        {
                            "title": "main title if any",
                            "headings": ["list of headings"],
                            "body_text": "main content",
                            "tables": ["any tabular data"],
                            "lists": ["any bullet points or lists"],
                            "other": "any other text"
                        }"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )

    # Returned as-is: the annotation is `str` because no JSON decoding happens.
    return response.choices[0].message.content

# Usage
# Both helpers return the model's reply text for the same document image;
# the second one asks for the text arranged as a JSON structure.
text_content = extract_text_from_image("document.jpg")
structured_content = extract_structured_text("document.jpg")

Chart and Graph Analysis

python
def analyze_chart(image_path: str, analysis_type: str = "comprehensive") -> str:
    """Analyze a chart image using a prompt chosen by ``analysis_type``.

    Args:
        image_path: Path of the chart image to encode and send.
        analysis_type: One of "comprehensive", "trends", "data_extraction",
            "insights" or "comparison". Any other value uses the
            "comprehensive" prompt.

    Returns:
        The text content of the first choice in the API response.
    """
    prompts_by_focus = {
        "comprehensive": "Provide a comprehensive analysis of this chart including data trends, key insights, and conclusions.",
        "trends": "Focus on identifying and describing the main trends shown in this chart.",
        "data_extraction": "Extract the specific data points and values shown in this chart.",
        "insights": "What are the key business insights that can be derived from this chart?",
        "comparison": "Compare the different data series or categories shown in this chart.",
    }

    # Unknown analysis types fall back to the comprehensive prompt.
    if analysis_type in prompts_by_focus:
        chosen_prompt = prompts_by_focus[analysis_type]
    else:
        chosen_prompt = prompts_by_focus["comprehensive"]

    encoded = encode_image(image_path)
    user_content = [
        {"type": "text", "text": chosen_prompt},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
    ]

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": user_content}],
    )

    return completion.choices[0].message.content

# Usage examples
# The same chart image is sent three times; the second argument selects
# which of the predefined prompts accompanies it.
comprehensive_analysis = analyze_chart("sales_chart.jpg", "comprehensive")
trend_analysis = analyze_chart("sales_chart.jpg", "trends")
data_extraction = analyze_chart("sales_chart.jpg", "data_extraction")

Visual Question Answering

python
class VisualQA:
    """Answer questions about images that were encoded once and kept in memory."""

    def __init__(self):
        # Maps an image id to the base64 text of the encoded file.
        self.image_cache = {}

    def load_image(self, image_path: str, image_id: str = None):
        """Encode an image and store it under ``image_id``.

        Args:
            image_path: Path of the image file to encode.
            image_id: Key to store the image under; the path itself is used
                when this is None.
        """
        cache_key = image_path if image_id is None else image_id
        self.image_cache[cache_key] = encode_image(image_path)

    def ask_question(self, question: str, image_id: str) -> str:
        """Send one question about a previously loaded image.

        Args:
            question: Text prompt to send with the image.
            image_id: Key the image was stored under by ``load_image``.

        Returns:
            The text content of the first choice in the API response.

        Raises:
            ValueError: If ``image_id`` has not been loaded.
        """
        if image_id not in self.image_cache:
            raise ValueError(f"Image {image_id} not found in cache")

        user_content = [
            {"type": "text", "text": question},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{self.image_cache[image_id]}"},
            },
        ]

        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "user", "content": user_content}],
        )

        return completion.choices[0].message.content

    def batch_questions(self, questions: list, image_id: str) -> dict:
        """Ask each question in turn about the same loaded image.

        Args:
            questions: Question strings, asked one request at a time in order.
            image_id: Key the image was stored under by ``load_image``.

        Returns:
            A dict mapping each question to its answer.
        """
        return {item: self.ask_question(item, image_id) for item in questions}

# Usage: encode the image once, then ask several questions against it.
vqa = VisualQA()

vqa.load_image("product_photo.jpg", "product")

questions = [
    "What color is the product?",
    "What is the brand name?",
    "What are the key features visible?",
    "Is there any text or labels?",
    "What is the approximate size?"
]

answers = vqa.batch_questions(questions, "product")

# Print each question/answer pair followed by a blank line.
for question, answer in answers.items():
    print(f"Q: {question}\nA: {answer}\n")

Document Analysis

python
def analyze_document(image_path: str, document_type: str = "general") -> dict:
    """Analyze a document image using a prompt tailored to its type.

    Args:
        image_path: Path of the document image to encode and send.
        document_type: One of "invoice", "receipt", "contract", "form" or
            "general". Any other value uses the "general" prompt.

    Returns:
        A dict with "document_type" (as passed in), "analysis" (the text
        content of the first choice in the API response) and "image_path".
    """
    prompts_by_type = {
        "invoice": """Analyze this invoice and extract:
        - Invoice number
        - Date
        - Vendor information
        - Line items with quantities and prices
        - Total amount
        - Payment terms""",

        "receipt": """Analyze this receipt and extract:
        - Store name and location
        - Date and time
        - Items purchased with prices
        - Subtotal, tax, and total
        - Payment method""",

        "contract": """Analyze this contract and identify:
        - Parties involved
        - Key terms and conditions
        - Important dates
        - Financial obligations
        - Signatures and dates""",

        "form": """Analyze this form and extract:
        - Form title and purpose
        - All filled-in fields and values
        - Empty fields that need completion
        - Instructions or notes""",

        "general": """Analyze this document and provide:
        - Document type and purpose
        - Key information and data
        - Structure and organization
        - Important details"""
    }

    # Unknown document types fall back to the general prompt.
    if document_type in prompts_by_type:
        chosen_prompt = prompts_by_type[document_type]
    else:
        chosen_prompt = prompts_by_type["general"]

    encoded = encode_image(image_path)
    user_content = [
        {"type": "text", "text": chosen_prompt},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
    ]

    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": user_content}],
    )

    analysis_text = completion.choices[0].message.content
    return {
        "document_type": document_type,
        "analysis": analysis_text,
        "image_path": image_path,
    }

# Usage
# Each result is a dict holding the requested document type, the model's
# analysis text, and the image path that was analyzed.
invoice_analysis = analyze_document("invoice.jpg", "invoice")
receipt_analysis = analyze_document("receipt.jpg", "receipt")

Performance Optimization

Batch Processing

python
import asyncio
from concurrent.futures import ThreadPoolExecutor
import time

class MultimodalBatchProcessor:
    """Run image-analysis requests concurrently on a thread pool."""

    def __init__(self, max_workers: int = 5):
        """Store the pool size used by the batch methods.

        Args:
            max_workers: Maximum number of worker threads per batch call.
        """
        self.max_workers = max_workers

    def process_single_image(self, image_path: str, prompt: str) -> dict:
        """Analyze one image and report the outcome without raising.

        Args:
            image_path: Path of the image file to encode and send.
            prompt: Text prompt sent with the image.

        Returns:
            On success: {"image_path", "success": True, "result",
            "processing_time"}. On any failure: {"image_path",
            "success": False, "error", "processing_time"}, where "error" is
            the exception message. "processing_time" is elapsed seconds.
        """
        # perf_counter is monotonic, so the measured duration cannot jump or
        # go negative if the system clock is adjusted during the request.
        started = time.perf_counter()

        try:
            base64_image = encode_image(image_path)

            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ]
            )

            return {
                "image_path": image_path,
                "success": True,
                "result": response.choices[0].message.content,
                "processing_time": time.perf_counter() - started
            }

        except Exception as e:
            # Deliberately broad: one failing image must not abort the rest of
            # the batch, so every error is converted into a result record.
            return {
                "image_path": image_path,
                "success": False,
                "error": str(e),
                "processing_time": time.perf_counter() - started
            }

    def _run_jobs(self, jobs) -> list:
        """Run (image_path, prompt) jobs on the pool; results keep job order."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_single_image, image_path, prompt)
                for image_path, prompt in jobs
            ]

            # Collect in submission order rather than completion order.
            return [future.result() for future in futures]

    def process_batch(self, image_paths: list, prompt: str) -> list:
        """Analyze several images in parallel with one shared prompt.

        Args:
            image_paths: Paths of the images to analyze.
            prompt: Text prompt sent with every image.

        Returns:
            One result dict per image, in the same order as ``image_paths``.
        """
        return self._run_jobs([(image_path, prompt) for image_path in image_paths])

    def process_with_different_prompts(self, image_prompt_pairs: list) -> list:
        """Analyze several images in parallel, each with its own prompt.

        Args:
            image_prompt_pairs: Iterable of (image_path, prompt) pairs.

        Returns:
            One result dict per pair, in the same order as the input.
        """
        return self._run_jobs(image_prompt_pairs)

# Usage: a pool of three worker threads.
processor = MultimodalBatchProcessor(max_workers=3)

# Same prompt for every image.
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
results = processor.process_batch(image_paths, "Describe this image in detail.")

# A dedicated prompt per image.
image_prompt_pairs = [
    ("chart1.jpg", "Analyze this chart and extract key trends."),
    ("document1.jpg", "Extract all text from this document."),
    ("product1.jpg", "Describe this product and its features.")
]

results = processor.process_with_different_prompts(image_prompt_pairs)

# One status line per result: timing on success, the error message otherwise.
for result in results:
    print(
        f"✅ {result['image_path']}: Processed in {result['processing_time']:.2f}s"
        if result["success"]
        else f"❌ {result['image_path']}: Error - {result['error']}"
    )

Caching and Optimization

python
import hashlib
import json
import time
from pathlib import Path
from typing import Optional

class MultimodalCache:
    """Persist API replies on disk, keyed by image content and prompt text."""

    def __init__(self, cache_dir: str = "./cache"):
        """Create the cache directory if needed.

        Args:
            cache_dir: Directory that holds one JSON file per cached reply.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True lets callers pass a nested directory that does not
        # exist yet instead of failing on the missing parent.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_key(self, image_path: str, prompt: str) -> str:
        """Build the cache key "<md5 of image bytes>_<md5 of prompt>".

        The key depends on the file's contents, not its path, so a changed
        image produces a different key.
        """
        with open(image_path, "rb") as f:
            image_hash = hashlib.md5(f.read()).hexdigest()

        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()

        return f"{image_hash}_{prompt_hash}"

    def get_cached_response(self, image_path: str, prompt: str) -> Optional[dict]:
        """Load the cached entry for this image/prompt pair.

        Args:
            image_path: Path of the image; its bytes are hashed for the key.
            prompt: Prompt text; hashed for the key.

        Returns:
            The decoded JSON entry, or None when there is no cache file or the
            file cannot be decoded.
        """
        cache_key = self._get_cache_key(image_path, prompt)
        cache_file = self.cache_dir / f"{cache_key}.json"

        try:
            with open(cache_file, "r") as f:
                return json.load(f)
        except FileNotFoundError:
            return None
        except ValueError:
            # A truncated or corrupt entry (JSON or text decoding error) is
            # treated as a miss so the caller simply re-fetches.
            return None

    def cache_response(self, image_path: str, prompt: str, response: str):
        """Write a reply to the cache.

        Args:
            image_path: Path of the image the reply belongs to.
            prompt: Prompt text the reply belongs to.
            response: Reply text to store.
        """
        cache_key = self._get_cache_key(image_path, prompt)
        cache_file = self.cache_dir / f"{cache_key}.json"

        cache_data = {
            "image_path": image_path,
            "prompt": prompt,
            "response": response,
            "timestamp": time.time()
        }

        with open(cache_file, "w") as f:
            json.dump(cache_data, f, indent=2)

    def analyze_with_cache(self, image_path: str, prompt: str) -> str:
        """Return the cached reply if present, otherwise call the API and cache it.

        Args:
            image_path: Path of the image file to analyze.
            prompt: Text prompt sent with the image.

        Returns:
            The reply text, from the cache or from the first choice of a
            fresh API response.
        """
        cached = self.get_cached_response(image_path, prompt)
        # Only trust entries that have the expected shape.
        if isinstance(cached, dict) and "response" in cached:
            print(f"✅ Using cached response for {image_path}")
            return cached["response"]

        print(f"🔄 Making API call for {image_path}")
        base64_image = encode_image(image_path)

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )

        result = response.choices[0].message.content

        self.cache_response(image_path, prompt, result)

        return result

# Usage
# Entries are stored under the default "./cache" directory.
cache = MultimodalCache()

# First call - makes API request
result1 = cache.analyze_with_cache("image.jpg", "Describe this image.")

# Second call - uses cache
# (same image bytes and same prompt produce the same cache key)
result2 = cache.analyze_with_cache("image.jpg", "Describe this image.")

Best Practices

Prompt Engineering for Vision

python
# Reusable prompt texts keyed by task name. Each value is a multi-line
# instruction listing the aspects the model should cover for that task.
VISION_PROMPT_TEMPLATES = {
    "detailed_description": """Provide a detailed description of this image including:
    - Main subjects and objects
    - Colors, lighting, and composition
    - Setting and background
    - Any text or symbols visible
    - Overall mood or atmosphere""",
    
    "technical_analysis": """Analyze this image from a technical perspective:
    - Image quality and resolution
    - Composition and framing
    - Lighting conditions
    - Any technical issues or artifacts
    - Suggestions for improvement""",
    
    "accessibility_description": """Create an accessibility description for this image:
    - Describe all visual elements clearly
    - Include spatial relationships
    - Mention colors and their significance
    - Describe any text or important details
    - Keep it concise but comprehensive""",
    
    "data_extraction": """Extract all data and information from this image:
    - Any numerical data or statistics
    - Text content and labels
    - Categorical information
    - Relationships between elements
    - Structure the output clearly""",
    
    "comparison_analysis": """Compare and analyze the elements in this image:
    - Identify different categories or groups
    - Compare sizes, colors, or quantities
    - Note similarities and differences
    - Highlight the most significant findings"""
}

def get_optimized_prompt(task_type: str, custom_instructions: str = "") -> str:
    """Look up the prompt template for a task, optionally appending extra instructions.

    Args:
        task_type: Key into VISION_PROMPT_TEMPLATES. Unknown keys use the
            "detailed_description" template.
        custom_instructions: Extra text appended after the template under an
            "Additional instructions:" label; ignored when empty.

    Returns:
        The template text, with the extra instructions appended if given.
    """
    if task_type in VISION_PROMPT_TEMPLATES:
        template = VISION_PROMPT_TEMPLATES[task_type]
    else:
        template = VISION_PROMPT_TEMPLATES["detailed_description"]

    if not custom_instructions:
        return template

    return f"{template}\n\nAdditional instructions: {custom_instructions}"

# Usage
# Produces the "data_extraction" template followed by the extra instruction text.
prompt = get_optimized_prompt("data_extraction", "Focus on financial data and percentages")

Error Handling

python
def robust_image_analysis(image_path: str, prompt: str, max_retries: int = 3) -> dict:
    """Validate an image, then analyze it with retries and exponential backoff.

    Validation and encoding happen once, before any request is sent; only the
    API call itself is retried.

    Args:
        image_path: Path of the image file to validate, encode and send.
        prompt: Text prompt sent with the image.
        max_retries: Maximum number of API attempts.

    Returns:
        On success: {"success": True, "result": <reply text>,
        "attempt": <1-based attempt number that succeeded>}.
        On failure: {"success": False, "error": <message>}, plus "attempts"
        when the failure came from an exception.
    """
    # Local checks are deterministic, so repeating them on every retry (and
    # backing off between repeats) would only add delay.
    try:
        if not Path(image_path).exists():
            return {"success": False, "error": f"Image file not found: {image_path}"}

        if not validate_image_format(image_path):
            return {"success": False, "error": f"Unsupported image format: {image_path}"}

        constraints = check_image_constraints(image_path)
        if not constraints["file_size_ok"]:
            return {
                "success": False,
                "error": f"Image too large: {constraints['file_size_mb']:.2f} MB"
            }

        base64_image = encode_image(image_path)
    except Exception as e:
        # Keep the "never raises" contract for unreadable or corrupt files;
        # zero attempts because no request was sent.
        return {"success": False, "error": str(e), "attempts": 0}

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                timeout=30  # 30 second timeout
            )

            return {
                "success": True,
                "result": response.choices[0].message.content,
                "attempt": attempt + 1
            }

        except Exception as e:
            # Broad on purpose: any request failure is retried.
            if attempt == max_retries - 1:  # Last attempt
                return {
                    "success": False,
                    "error": str(e),
                    "attempts": max_retries
                }

            # Exponential backoff: 1s, 2s, 4s, ...
            time.sleep(2 ** attempt)

    # Only reached when max_retries is 0 or negative.
    return {"success": False, "error": "Max retries exceeded"}

# Usage: the result dict always carries either "result" or "error".
result = robust_image_analysis("image.jpg", "Describe this image.")

print(
    f"✅ Success: {result['result']}"
    if result.get("success")
    else f"❌ Error: {result['error']}"
)

Integration Examples

Web Application Integration

python
from flask import Flask, request, jsonify
import tempfile
import os

app = Flask(__name__)

@app.route('/analyze-image', methods=['POST'])
def analyze_image_endpoint():
    """Handle POST /analyze-image: analyze an uploaded image.

    Expects a multipart form with an "image" file and an optional "prompt"
    field (default "Describe this image.").

    Returns:
        400 with {"error"} when no file was provided or selected;
        200 with {"success": True, "analysis"} on success;
        500 with {"success": False, "error"} when the analysis fails, or
        {"error"} when an unexpected exception occurs.
    """
    tmp_path = None

    try:
        if 'image' not in request.files:
            return jsonify({"error": "No image file provided"}), 400

        file = request.files['image']
        prompt = request.form.get('prompt', 'Describe this image.')

        if file.filename == '':
            return jsonify({"error": "No file selected"}), 400

        # Reserve a temp path and close the handle before writing to it by
        # name: an open NamedTemporaryFile cannot be reopened or deleted on
        # every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
            tmp_path = tmp_file.name
        file.save(tmp_path)

        result = robust_image_analysis(tmp_path, prompt)

        if result.get("success"):
            return jsonify({
                "success": True,
                "analysis": result["result"]
            })

        return jsonify({
            "success": False,
            "error": result["error"]
        }), 500

    except Exception as e:
        return jsonify({"error": str(e)}), 500

    finally:
        # Runs on every exit path, so the upload is removed even when saving
        # or analysis raises.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass

# Start the development server only when this file is run as a script.
# NOTE(review): debug=True is meant for local development — confirm it is
# turned off wherever this is deployed.
if __name__ == '__main__':
    app.run(debug=True)

Streamlit Application

python
import io
import os
import tempfile

import streamlit as st
from PIL import Image

st.set_page_config(page_title="Multimodal Image Analyzer", layout="wide")

st.title("🖼️ Multimodal Image Analyzer")

# Default prompt for each analysis type. The selectbox options are derived
# from these keys so the two can never drift apart.
PROMPT_MAP = {
    "General Description": "Provide a detailed description of this image.",
    "OCR Text Extraction": "Extract all text from this image.",
    "Chart Analysis": "Analyze this chart and provide insights about the data.",
    "Document Analysis": "Analyze this document and extract key information."
}

# Sidebar configuration
with st.sidebar:
    st.header("Configuration")

    analysis_type = st.selectbox(
        "Analysis Type",
        list(PROMPT_MAP)
    )

    custom_prompt = st.text_area(
        "Custom Prompt (optional)",
        placeholder="Enter your custom analysis prompt..."
    )

# Main interface
col1, col2 = st.columns([1, 1])

# Stays None until the user uploads a file in the left column.
image = None

with col1:
    st.header("Upload Image")

    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=['jpg', 'jpeg', 'png', 'gif', 'webp']
    )

    if uploaded_file is not None:
        # Display image
        # NOTE: newer Streamlit releases deprecate `use_column_width` in
        # favor of `use_container_width`; switch once the minimum supported
        # Streamlit version allows it.
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

with col2:
    st.header("Analysis Results")

    if image is not None:
        if st.button("Analyze Image", type="primary"):
            with st.spinner("Analyzing image..."):
                # A non-blank custom prompt overrides the preset for the
                # selected analysis type.
                prompt = (custom_prompt or "").strip() or PROMPT_MAP[analysis_type]

                # Create the temporary file only when an analysis is actually
                # requested: Streamlit reruns this script on every widget
                # interaction, so creating it at upload time leaks one file
                # per rerun. The handle is closed before the path is reused.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
                    temp_path = tmp_file.name

                try:
                    # JPEG cannot store alpha or palette data, so normalize
                    # PNG/GIF/WebP uploads to RGB before saving.
                    image.convert("RGB").save(temp_path, format='JPEG')

                    # Analyze image
                    result = robust_image_analysis(temp_path, prompt)
                finally:
                    # Clean up even when saving or the analysis raises.
                    if os.path.exists(temp_path):
                        os.unlink(temp_path)

                if result.get("success"):
                    st.success("Analysis completed!")
                    st.write(result["result"])
                else:
                    st.error(f"Analysis failed: {result['error']}")

Troubleshooting

Common Issues

python
def diagnose_multimodal_issues(image_path: str, error_message: str) -> list:
    """Collect human-readable hints about why a multimodal request failed.

    Args:
        image_path: Path of the image that was sent to the API.
        error_message: Error text returned by the failed request.

    Returns:
        A list of diagnostic strings. A missing file short-circuits with a
        single entry; otherwise the list holds one entry per detected problem,
        or a single "no obvious issues" entry when nothing matched.
    """

    # Nothing else can be inspected when the file is not there.
    if not Path(image_path).exists():
        return ["❌ Image file does not exist"]

    findings = []

    # File-level checks: format first, then size.
    if not validate_image_format(image_path):
        findings.append("❌ Unsupported image format")

    limits = check_image_constraints(image_path)
    if not limits["file_size_ok"]:
        findings.append(f"❌ Image too large: {limits['file_size_mb']:.2f} MB")

    # Error-text checks: a hint applies when ALL of its keywords occur in the
    # lower-cased message. Table order is the order hints are reported in.
    lowered = error_message.lower()
    keyword_hints = (
        (("timeout",), "❌ Request timeout - try reducing image size"),
        (("rate limit",), "❌ Rate limit exceeded - wait before retrying"),
        (("invalid", "base64"), "❌ Invalid base64 encoding - check image processing"),
        (("model",), "❌ Model error - ensure using 'deepseek-chat'"),
    )
    findings.extend(
        hint
        for keywords, hint in keyword_hints
        if all(word in lowered for word in keywords)
    )

    return findings or ["✅ No obvious issues detected"]

# Usage: print every diagnostic hint for a failed request, one per line.
issues = diagnose_multimodal_issues(
    image_path="problematic_image.jpg",
    error_message="Request timeout error",
)
for issue in issues:
    print(issue)

Next Steps

基于 DeepSeek AI 大模型技术