Skip to content

Multimodal API

The DeepSeek Multimodal API enables you to work with both text and images in a single conversation, allowing for rich, context-aware interactions.

Overview

Our multimodal capabilities allow you to:

  • Analyze and describe images
  • Answer questions about visual content
  • Generate text based on image inputs
  • Combine text and image understanding in conversations

Supported Formats

Image Formats

  • JPEG (.jpg, .jpeg)
  • PNG (.png)
  • GIF (.gif)
  • WebP (.webp)
  • BMP (.bmp)

Image Constraints

  • Maximum file size: 20MB
  • Maximum dimensions: 4096 x 4096 pixels
  • Minimum dimensions: 32 x 32 pixels

Basic Usage

Image URL Input

python
from openai import OpenAI

# NOTE(review): replace "YOUR_API_KEY" with a real key; base_url targets
# DeepSeek's OpenAI-compatible v1 endpoint.
client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com/v1"
)

# A multimodal user turn mixes "text" and "image_url" content parts in a
# single message; the image here is fetched by the service from a URL.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What do you see in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://example.com/image.jpg"
                    }
                }
            ]
        }
    ],
    max_tokens=500  # cap on the generated reply length
)

print(response.choices[0].message.content)

Base64 Image Input

python
import base64

def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as a base64 string."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode("utf-8")

# Encode the image
base64_image = encode_image("path/to/your/image.jpg")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image in detail."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ],
    max_tokens=800
)

Advanced Features

Multiple Images

You can include multiple images in a single request:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Compare these two images and tell me the differences."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image1.jpg"}
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image2.jpg"}
                }
            ]
        }
    ],
    max_tokens=1000
)

Image Detail Control

Control the level of detail in image processing:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this image in high detail."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://example.com/image.jpg",
                        "detail": "high"  # Options: "low", "high", "auto"
                    }
                }
            ]
        }
    ]
)

Detail Levels

  • low: Faster processing, basic image understanding
  • high: Detailed analysis, better for complex images
  • auto: Automatically chooses based on image complexity (default)

Use Cases

Image Description

Generate detailed descriptions of images:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Provide a detailed description of this image, including objects, people, setting, and mood."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/photo.jpg"}
                }
            ]
        }
    ]
)

Visual Question Answering

Ask specific questions about image content:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "How many people are in this image? What are they doing?"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/group_photo.jpg"}
                }
            ]
        }
    ]
)

OCR and Text Extraction

Extract and read text from images:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Extract all the text from this image and format it nicely."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/document.jpg"}
                }
            ]
        }
    ]
)

Chart and Graph Analysis

Analyze data visualizations:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this chart. What trends do you see? Provide insights and key takeaways."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/sales_chart.png"}
                }
            ]
        }
    ]
)

Product Analysis

Analyze products in images:

python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this product image. Describe the product, its features, and suggest improvements."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/product.jpg"}
                }
            ]
        }
    ]
)

Conversation Context

Multi-turn Conversations

Maintain context across multiple exchanges:

python
# Manually assembled multi-turn history: the assistant's earlier answer is
# replayed in the message list so the follow-up question can refer back to
# the kitchen image without resending it.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What do you see in this image?"
            },
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/kitchen.jpg"}
            }
        ]
    },
    {
        # Prior assistant turn (plain string content is accepted here).
        "role": "assistant",
        "content": "I see a modern kitchen with stainless steel appliances, granite countertops, and white cabinets..."
    },
    {
        # Follow-up references the image already in the conversation context.
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What cooking equipment do you see that would be good for baking?"
            }
        ]
    }
]

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=conversation
)

Image Reference in Follow-up

Reference previously shown images:

python
# First message with image
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe this room."
            },
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/room.jpg"}
            }
        ]
    }
]

# Get initial response
response1 = client.chat.completions.create(
    model="deepseek-chat",
    messages=messages
)

# Add assistant response to conversation
messages.append({
    "role": "assistant",
    "content": response1.choices[0].message.content
})

# Follow-up question about the same image
messages.append({
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "What changes would you suggest to make this room more modern?"
        }
    ]
})

response2 = client.chat.completions.create(
    model="deepseek-chat",
    messages=messages
)

Best Practices

Image Quality

  1. Use high-quality images: Better quality leads to better analysis
  2. Ensure good lighting: Well-lit images are easier to analyze
  3. Avoid blurry images: Sharp, clear images work best
  4. Consider composition: Well-framed subjects are easier to identify

Prompt Engineering

  1. Be specific: Ask clear, specific questions about the image
  2. Provide context: Give background information when helpful
  3. Use examples: Show the format you want for responses
  4. Break down complex tasks: Split complex analysis into steps
python
# Good prompt example
prompt = """
Analyze this product image and provide:
1. Product name and category
2. Key features visible in the image
3. Target audience
4. Pricing tier estimate (budget/mid-range/premium)
5. Marketing suggestions based on visual appeal
"""

Performance Optimization

  1. Optimize image size: Resize large images to reduce processing time
  2. Use appropriate detail level: Choose "low" for simple tasks
  3. Batch similar requests: Group related image analysis tasks
  4. Cache results: Store analysis results for repeated queries

Error Handling

python
import base64
from PIL import Image
import io

def safe_image_analysis(image_path, prompt):
    """Validate/resize an image, send it for analysis, and return a result dict.

    Returns {"success": True, "result": <text>} on success, or
    {"success": False, "error": <message>} on any failure.
    Relies on the module-level `client` and `encode_image` defined above.
    """
    try:
        # Validate image
        with Image.open(image_path) as img:
            if img.size[0] > 4096 or img.size[1] > 4096:
                # Downscale in place to the documented 4096x4096 limit.
                img.thumbnail((4096, 4096), Image.Resampling.LANCZOS)

                # Bug fix: JPEG has no alpha channel. Modes like RGBA or P
                # (common for PNG/GIF) make img.save(..., format='JPEG')
                # raise OSError, so convert to RGB first.
                if img.mode not in ("RGB", "L"):
                    img = img.convert("RGB")

                # Convert the resized image to base64
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG')
                base64_image = base64.b64encode(buffer.getvalue()).decode()
            else:
                # Small enough: send the original file bytes untouched.
                base64_image = encode_image(image_path)

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )

        return {"success": True, "result": response.choices[0].message.content}

    except Exception as e:
        # Best-effort doc example: surface the error text instead of raising.
        return {"success": False, "error": str(e)}

Limitations

Current Limitations

  1. No image generation: The API analyzes images but doesn't create them
  2. Static images only: No support for video or animated content
  3. No image editing: Cannot modify or enhance images
  4. Context window: Images consume significant tokens

Content Restrictions

  • No analysis of inappropriate or harmful content
  • Limited support for very low-quality or corrupted images
  • Cannot identify specific individuals (privacy protection)

Pricing

Token Calculation

Images are converted to tokens based on:

  • Image size and resolution
  • Detail level selected
  • Processing complexity

Cost Estimation

python
def estimate_image_tokens(width, height, detail="auto"):
    """Estimate token usage for an image.

    "low" detail costs a flat 85 tokens; any other detail level is billed
    as base cost plus a per-tile cost after scaling into 2048x2048.
    """
    LOW_DETAIL_COST = 85
    BASE_COST = 85
    TILE_COST = 170
    MAX_SIDE = 2048
    TILE_SIDE = 512

    if detail == "low":
        return LOW_DETAIL_COST

    # High/auto detail: scale down so both sides fit within 2048 pixels.
    if max(width, height) > MAX_SIDE:
        factor = min(MAX_SIDE / width, MAX_SIDE / height)
        width, height = int(width * factor), int(height * factor)

    # Cover the scaled image with 512x512 tiles (ceiling division).
    tiles_across = -(-width // TILE_SIDE)
    tiles_down = -(-height // TILE_SIDE)

    return BASE_COST + TILE_COST * (tiles_across * tiles_down)

Code Examples

Complete Image Analysis Application

python
import base64
import json
from openai import OpenAI
from PIL import Image
import io

class ImageAnalyzer:
    """Thin wrapper around the DeepSeek chat API for image-analysis tasks."""

    # Media types for the formats listed under "Supported Formats".
    # Unknown extensions fall back to image/jpeg (the previous behavior).
    _MIME_BY_EXT = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "gif": "image/gif",
        "webp": "image/webp",
        "bmp": "image/bmp",
    }

    def __init__(self, api_key):
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.deepseek.com/v1"
        )

    def encode_image(self, image_path):
        """Encode the file at *image_path* to a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _media_type(self, image_path):
        """Return the MIME type for *image_path* based on its extension."""
        ext = image_path.rsplit(".", 1)[-1].lower()
        return self._MIME_BY_EXT.get(ext, "image/jpeg")

    def analyze_image(self, image_path, prompt, detail="auto"):
        """Analyze an image with a custom prompt.

        detail is "low", "high", or "auto". Returns
        {"success": True, "analysis": ..., "usage": ...} on success or
        {"success": False, "error": ...} on failure.
        """
        try:
            base64_image = self.encode_image(image_path)
            # Bug fix: the data URL previously declared image/jpeg for every
            # file, mislabeling PNG/GIF/WebP/BMP payloads.
            media_type = self._media_type(image_path)

            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{media_type};base64,{base64_image}",
                                    "detail": detail
                                }
                            }
                        ]
                    }
                ],
                max_tokens=1000
            )

            return {
                "success": True,
                "analysis": response.choices[0].message.content,
                "usage": response.usage
            }

        except Exception as e:
            # Best-effort doc example: report the error instead of raising.
            return {"success": False, "error": str(e)}

    def extract_text(self, image_path):
        """Extract text from an image (OCR)."""
        prompt = "Extract all text from this image. Format it clearly and maintain the original structure."
        return self.analyze_image(image_path, prompt)

    def describe_image(self, image_path):
        """Generate a detailed description of the image."""
        prompt = "Provide a detailed description of this image, including objects, people, setting, colors, and overall composition."
        return self.analyze_image(image_path, prompt)

    def analyze_chart(self, image_path):
        """Analyze charts and graphs at high detail."""
        prompt = "Analyze this chart or graph. Describe the data, trends, key insights, and provide a summary of findings."
        return self.analyze_image(image_path, prompt, detail="high")

# Usage example
analyzer = ImageAnalyzer("YOUR_API_KEY")

# Analyze an image
result = analyzer.describe_image("path/to/image.jpg")
if result["success"]:
    print(result["analysis"])
else:
    print(f"Error: {result['error']}")

Batch Image Processing

python
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_images_batch(image_folder, output_file):
    """Describe every supported image in *image_folder* in parallel.

    Results are keyed by filename and written to *output_file* as JSON.
    Relies on the ImageAnalyzer class defined above.
    """
    analyzer = ImageAnalyzer("YOUR_API_KEY")
    results = {}

    # Match the formats listed under "Supported Formats" (.bmp was missing).
    supported = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    image_files = [f for f in os.listdir(image_folder)
                   if f.lower().endswith(supported)]

    def process_single_image(filename):
        # One worker task: analyze a single file and tag the result with its name.
        image_path = os.path.join(image_folder, filename)
        result = analyzer.describe_image(image_path)
        return filename, result

    # Process images in parallel
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_filename = {
            executor.submit(process_single_image, filename): filename
            for filename in image_files
        }

        for future in as_completed(future_to_filename):
            filename, result = future.result()
            results[filename] = result
            # Bug fix: this previously printed a literal placeholder
            # instead of the processed filename.
            print(f"Processed: {filename}")

    # Save results
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

# Usage
results = process_images_batch("./images", "analysis_results.json")

Next Steps

Powered by DeepSeek AI large language model technology