Long Text Processing Guide

Learn how to effectively handle long documents, large datasets, and extended conversations with DeepSeek's advanced text processing capabilities.

Overview

DeepSeek supports processing of long texts with:

  • Extended context length: Up to 128K tokens for comprehensive analysis
  • Efficient chunking: Smart text segmentation strategies
  • Context preservation: Maintain coherence across long documents
  • Memory optimization: Efficient handling of large inputs
  • Streaming support: Process long texts with real-time output

Context Length Limits

Understanding Token Limits

python
import tiktoken
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com/v1"
)

def count_tokens(text: str, model: str = "deepseek-chat") -> int:
    """Approximate the token count of text for the specified model"""
    
    # DeepSeek does not ship a tiktoken encoding; cl100k_base (the GPT-3.5/4
    # encoding) is a reasonable approximation of its tokenizer
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def check_context_limit(messages: list, model: str = "deepseek-chat") -> dict:
    """Check if messages fit within context limit"""
    
    # Model limits
    limits = {
        "deepseek-chat": 128000,  # 128K tokens
        "deepseek-coder": 128000,
        "deepseek-math": 128000
    }
    
    total_tokens = 0
    for message in messages:
        # Counts message content only; per-message formatting overhead is ignored
        total_tokens += count_tokens(message["content"])
    
    limit = limits.get(model, 128000)
    
    return {
        "total_tokens": total_tokens,
        "limit": limit,
        "within_limit": total_tokens <= limit,
        "usage_percentage": (total_tokens / limit) * 100,
        "remaining_tokens": limit - total_tokens
    }

# Example usage
long_document = """
[Your long document content here...]
"""

messages = [
    {"role": "system", "content": "You are a document analysis assistant."},
    {"role": "user", "content": f"Please analyze this document:\n\n{long_document}"}
]

context_info = check_context_limit(messages)
print(f"Token usage: {context_info['total_tokens']}/{context_info['limit']} ({context_info['usage_percentage']:.1f}%)")

Model-Specific Limits

python
class ContextManager:
    """Manage context limits for different models"""
    
    MODEL_LIMITS = {
        "deepseek-chat": 128000,
        "deepseek-coder": 128000,
        "deepseek-math": 128000
    }
    
    def __init__(self, model: str = "deepseek-chat"):
        self.model = model
        self.limit = self.MODEL_LIMITS.get(model, 128000)
    
    def can_fit(self, text: str) -> bool:
        """Check if text fits within model limit"""
        return count_tokens(text) <= self.limit
    
    def get_max_input_size(self, reserved_for_response: int = 4000) -> int:
        """Get maximum input size, reserving tokens for response"""
        return self.limit - reserved_for_response
    
    def estimate_response_tokens(self, input_tokens: int, task_type: str = "general") -> int:
        """Estimate response tokens based on task type"""
        
        estimates = {
            "summary": input_tokens * 0.1,      # 10% of input
            "analysis": input_tokens * 0.3,     # 30% of input
            "translation": input_tokens * 1.2,  # 120% of input
            "code_generation": input_tokens * 0.5,  # 50% of input
            "general": input_tokens * 0.2       # 20% of input
        }
        
        return int(estimates.get(task_type, estimates["general"]))

# Usage
manager = ContextManager("deepseek-chat")
max_input = manager.get_max_input_size()
print(f"Maximum input size: {max_input} tokens")

Text Chunking Strategies

Smart Chunking

python
import re
from typing import List, Tuple

class SmartChunker:
    """Intelligent text chunking with context preservation"""
    
    def __init__(self, max_chunk_size: int = 8000, overlap_size: int = 200):
        self.max_chunk_size = max_chunk_size
        self.overlap_size = overlap_size
    
    def chunk_by_paragraphs(self, text: str) -> List[str]:
        """Chunk text by paragraphs, respecting natural boundaries"""
        
        # Split by double newlines (paragraphs)
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = ""
        
        for paragraph in paragraphs:
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            
            # Check if adding this paragraph exceeds limit
            test_chunk = current_chunk + "\n\n" + paragraph if current_chunk else paragraph
            
            if count_tokens(test_chunk) <= self.max_chunk_size:
                current_chunk = test_chunk
            else:
                # Save current chunk and start new one
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = paragraph
        
        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks
    
    def chunk_by_sentences(self, text: str) -> List[str]:
        """Chunk text by sentences for better coherence"""
        
        # Split by sentence endings
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            
            test_chunk = current_chunk + " " + sentence if current_chunk else sentence
            
            if count_tokens(test_chunk) <= self.max_chunk_size:
                current_chunk = test_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
        
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks
    
    def chunk_with_overlap(self, text: str) -> List[Tuple[str, int, int]]:
        """Chunk text with overlapping sections for context preservation.

        Unlike the token-based methods above, this method works in character
        offsets, so max_chunk_size and overlap_size are treated as characters.
        """
        
        chunks = []
        start = 0
        text_length = len(text)
        
        while start < text_length:
            # Calculate end position
            end = min(start + self.max_chunk_size, text_length)
            
            # Find a good breaking point (sentence or paragraph end)
            if end < text_length:
                # Look for sentence ending within last 200 characters
                search_start = max(end - 200, start)
                sentence_end = text.rfind('.', search_start, end)
                if sentence_end > start:
                    end = sentence_end + 1
            
            chunk = text[start:end].strip()
            chunks.append((chunk, start, end))
            
            # Stop once the end of the text is reached; otherwise step back by
            # overlap_size so adjacent chunks share context (max() guarantees
            # forward progress even when overlap exceeds the chunk)
            if end >= text_length:
                break
            start = max(end - self.overlap_size, start + 1)
        
        return chunks
    
    def chunk_by_structure(self, text: str, structure_markers: List[str] = None) -> List[str]:
        """Chunk text based on structural markers (headers, sections)"""
        
        if structure_markers is None:
            structure_markers = [
                r'^#+ ',      # Markdown headers
                r'^\d+\.',    # Numbered sections
                r'^Chapter ', # Chapter markers
                r'^Section ', # Section markers
            ]
        
        # Find all structure markers
        markers = []
        for i, line in enumerate(text.split('\n')):
            for pattern in structure_markers:
                if re.match(pattern, line):
                    markers.append(i)
                    break
        
        if not markers:
            # No structure found, fall back to paragraph chunking
            return self.chunk_by_paragraphs(text)
        
        # Keep any preamble that appears before the first marker
        if markers[0] != 0:
            markers.insert(0, 0)
        
        lines = text.split('\n')
        chunks = []
        
        for i in range(len(markers)):
            start_line = markers[i]
            end_line = markers[i + 1] if i + 1 < len(markers) else len(lines)
            
            section = '\n'.join(lines[start_line:end_line]).strip()
            
            # If section is too large, further chunk it
            if count_tokens(section) > self.max_chunk_size:
                sub_chunks = self.chunk_by_paragraphs(section)
                chunks.extend(sub_chunks)
            else:
                chunks.append(section)
        
        return chunks

# Example usage
chunker = SmartChunker(max_chunk_size=6000, overlap_size=300)

long_text = """
Your very long document content here...
Multiple paragraphs, sections, etc.
"""

# Different chunking strategies
paragraph_chunks = chunker.chunk_by_paragraphs(long_text)
sentence_chunks = chunker.chunk_by_sentences(long_text)
overlap_chunks = chunker.chunk_with_overlap(long_text)
structure_chunks = chunker.chunk_by_structure(long_text)

print(f"Paragraph chunks: {len(paragraph_chunks)}")
print(f"Sentence chunks: {len(sentence_chunks)}")
print(f"Overlap chunks: {len(overlap_chunks)}")
print(f"Structure chunks: {len(structure_chunks)}")

Semantic Chunking

python
class SemanticChunker:
    """Chunk text based on semantic similarity"""
    
    def __init__(self, client, max_chunk_size: int = 6000):
        self.client = client
        self.max_chunk_size = max_chunk_size
    
    def get_semantic_boundaries(self, text: str) -> List[int]:
        """Find semantic boundaries in text"""
        
        paragraphs = text.split('\n\n')
        boundaries = []
        
        for i in range(len(paragraphs) - 1):
            # Get semantic similarity between adjacent paragraphs
            similarity = self.calculate_semantic_similarity(
                paragraphs[i], 
                paragraphs[i + 1]
            )
            
            # If similarity is low, it's a good boundary
            if similarity < 0.5:  # Threshold for semantic break
                boundaries.append(i + 1)
        
        return boundaries
    
    def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Calculate semantic similarity between two texts"""
        
        prompt = f"""
Rate the semantic similarity between these two text segments on a scale of 0.0 to 1.0:

Text 1: {text1[:500]}...
Text 2: {text2[:500]}...

Return only a number between 0.0 and 1.0, where:
- 0.0 = completely different topics
- 1.0 = same topic and context

Similarity score:"""
        
        try:
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "user", "content": prompt}
                ],
                max_tokens=10,
                temperature=0
            )
            
            score_text = response.choices[0].message.content.strip()
            return float(score_text)
        except Exception:
            return 0.5  # Default similarity if the call or parsing fails
    
    def chunk_semantically(self, text: str) -> List[str]:
        """Chunk text based on semantic boundaries"""
        
        paragraphs = text.split('\n\n')
        boundaries = self.get_semantic_boundaries(text)
        
        chunks = []
        start = 0
        
        for boundary in boundaries + [len(paragraphs)]:
            chunk_paragraphs = paragraphs[start:boundary]
            chunk = '\n\n'.join(chunk_paragraphs)
            
            # If chunk is too large, split it further
            if count_tokens(chunk) > self.max_chunk_size:
                sub_chunker = SmartChunker(self.max_chunk_size)
                sub_chunks = sub_chunker.chunk_by_paragraphs(chunk)
                chunks.extend(sub_chunks)
            else:
                chunks.append(chunk)
            
            start = boundary
        
        return chunks

# Usage (requires API calls, use sparingly)
# semantic_chunker = SemanticChunker(client)
# semantic_chunks = semantic_chunker.chunk_semantically(long_text)

Document Processing Workflows

Sequential Processing

python
import json

class DocumentProcessor:
    """Process long documents sequentially with context preservation"""
    
    def __init__(self, client, chunker: SmartChunker = None):
        self.client = client
        self.chunker = chunker or SmartChunker()
    
    def process_document_sequentially(self, document: str, task: str, 
                                    context_preservation: bool = True) -> List[str]:
        """Process document in chunks while preserving context"""
        
        chunks = self.chunker.chunk_by_paragraphs(document)
        results = []
        previous_context = ""
        
        for i, chunk in enumerate(chunks):
            # Build context-aware prompt
            if context_preservation and previous_context:
                prompt = f"""
Previous context: {previous_context[-500:]}...

Current section: {chunk}

Task: {task}

Please process the current section while considering the previous context.
"""
            else:
                prompt = f"""
Section {i+1} of {len(chunks)}:

{chunk}

Task: {task}
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a document processing assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            
            result = response.choices[0].message.content
            results.append(result)
            
            # Update context for next iteration
            if context_preservation:
                previous_context = chunk[-300:] + " " + result[-200:]
        
        return results
    
    def summarize_long_document(self, document: str, summary_type: str = "comprehensive") -> str:
        """Create a summary of a long document"""
        
        # First pass: summarize each chunk
        chunk_summaries = self.process_document_sequentially(
            document, 
            f"Create a {summary_type} summary of this section"
        )
        
        # Second pass: combine summaries
        combined_summaries = "\n\n".join(chunk_summaries)
        
        if count_tokens(combined_summaries) > 8000:
            # If combined summaries are still too long, summarize again
            final_summary_chunks = self.chunker.chunk_by_paragraphs(combined_summaries)
            final_summaries = []
            
            for chunk in final_summary_chunks:
                response = self.client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[
                        {"role": "system", "content": "You are a summarization expert."},
                        {"role": "user", "content": f"Condense this summary further:\n\n{chunk}"}
                    ]
                )
                final_summaries.append(response.choices[0].message.content)
            
            combined_summaries = "\n\n".join(final_summaries)
        
        # Final consolidation
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a document summarization expert."},
                {"role": "user", "content": f"""
Create a final {summary_type} summary from these section summaries:

{combined_summaries}

Ensure the final summary is coherent, comprehensive, and well-structured.
"""}
            ]
        )
        
        return response.choices[0].message.content
    
    def extract_key_information(self, document: str, extraction_criteria: List[str]) -> dict:
        """Extract specific information from long document"""
        
        criteria_text = "\n".join([f"- {criterion}" for criterion in extraction_criteria])
        
        chunks = self.chunker.chunk_by_paragraphs(document)
        extracted_info = {criterion: [] for criterion in extraction_criteria}
        
        for chunk in chunks:
            prompt = f"""
Extract the following information from this text section:

{criteria_text}

Text section:
{chunk}

Return the information in a structured format. If information is not found, indicate "Not found".
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are an information extraction specialist."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )
            
            try:
                chunk_info = json.loads(response.choices[0].message.content)
                for criterion in extraction_criteria:
                    if criterion in chunk_info and chunk_info[criterion] != "Not found":
                        extracted_info[criterion].append(chunk_info[criterion])
            except (json.JSONDecodeError, TypeError):
                continue
        
        # Consolidate extracted information
        consolidated = {}
        for criterion, values in extracted_info.items():
            if values:
                # Deduplicate while preserving order; JSON values may be
                # unhashable (e.g. lists), so avoid set()
                deduped = []
                for value in values:
                    if value not in deduped:
                        deduped.append(value)
                consolidated[criterion] = deduped
            else:
                consolidated[criterion] = "Not found"
        
        return consolidated

# Example usage
processor = DocumentProcessor(client)

# Summarize long document
summary = processor.summarize_long_document(
    long_document, 
    summary_type="executive"
)

# Extract specific information
key_info = processor.extract_key_information(
    long_document,
    [
        "main_conclusions",
        "key_statistics",
        "recommendations",
        "important_dates",
        "mentioned_companies"
    ]
)

print("Summary:", summary)
print("Key Information:", key_info)

Parallel Processing

python
import asyncio
import aiohttp
from typing import List, Dict, Any

class ParallelDocumentProcessor:
    """Process document chunks in parallel for faster processing"""
    
    def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
        self.api_key = api_key
        self.base_url = base_url
        self.chunker = SmartChunker()
    
    async def process_chunk_async(self, session: aiohttp.ClientSession, 
                                chunk: str, task: str, chunk_id: int) -> Dict[str, Any]:
        """Process a single chunk asynchronously"""
        
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": "You are a document processing assistant."},
                {"role": "user", "content": f"Chunk {chunk_id}: {task}\n\n{chunk}"}
            ]
        }
        
        async with session.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload
        ) as response:
            result = await response.json()
            return {
                "chunk_id": chunk_id,
                "result": result["choices"][0]["message"]["content"],
                "chunk": chunk[:100] + "..."  # Store snippet for reference
            }
    
    async def process_document_parallel(self, document: str, task: str, 
                                      max_concurrent: int = 5) -> List[Dict[str, Any]]:
        """Process document chunks in parallel"""
        
        chunks = self.chunker.chunk_by_paragraphs(document)
        
        # Create semaphore to limit concurrent requests
        semaphore = asyncio.Semaphore(max_concurrent)
        
        async def process_with_semaphore(session, chunk, task, chunk_id):
            async with semaphore:
                return await self.process_chunk_async(session, chunk, task, chunk_id)
        
        async with aiohttp.ClientSession() as session:
            tasks = [
                process_with_semaphore(session, chunk, task, i)
                for i, chunk in enumerate(chunks)
            ]
            
            results = await asyncio.gather(*tasks, return_exceptions=True)
        
        # Filter out exceptions and sort by chunk_id
        valid_results = [r for r in results if not isinstance(r, Exception)]
        valid_results.sort(key=lambda x: x["chunk_id"])
        
        return valid_results
    
    def run_parallel_processing(self, document: str, task: str) -> List[str]:
        """Run parallel processing (synchronous wrapper)"""
        
        results = asyncio.run(
            self.process_document_parallel(document, task)
        )
        return [result["result"] for result in results]

# Example usage
# parallel_processor = ParallelDocumentProcessor("YOUR_API_KEY")
# parallel_results = parallel_processor.run_parallel_processing(
#     long_document,
#     "Analyze the main themes and arguments in this section"
# )

Memory-Efficient Processing

Streaming Long Text

python
class StreamingProcessor:
    """Process long texts with streaming for memory efficiency"""
    
    def __init__(self, client):
        self.client = client
    
    def stream_long_text_analysis(self, text: str, analysis_type: str):
        """Stream analysis of long text"""
        
        prompt = f"""
Perform {analysis_type} analysis of this text. Stream your response as you analyze:

{text}
"""
        
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a text analysis expert. Provide streaming analysis."},
                {"role": "user", "content": prompt}
            ],
            stream=True
        )
        
        full_analysis = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_analysis += content
                print(content, end="", flush=True)
        
        return full_analysis
    
    def progressive_summarization(self, text: str, target_length: int = 500):
        """Progressively summarize text to target length"""
        
        current_text = text
        iteration = 1
        
        while count_tokens(current_text) > target_length:
            print(f"Summarization iteration {iteration}...")
            
            # Ask for the target length directly (a ratio of
            # target_length / current_tokens would cancel back out to the
            # same number when multiplied by current_tokens)
            prompt = f"""
Summarize this text to approximately {target_length} tokens:

{current_text}

Maintain key information and structure while reducing length.
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a summarization expert."},
                    {"role": "user", "content": prompt}
                ],
                stream=True
            )
            
            current_text = ""
            for chunk in response:
                if chunk.choices[0].delta.content:
                    content = chunk.choices[0].delta.content
                    current_text += content
                    print(content, end="", flush=True)
            
            print(f"\nIteration {iteration} complete. Tokens: {count_tokens(current_text)}")
            iteration += 1
            
            if iteration > 5:  # Prevent infinite loops
                break
        
        return current_text

# Example usage
streaming_processor = StreamingProcessor(client)

# Stream analysis
print("Streaming analysis:")
analysis = streaming_processor.stream_long_text_analysis(
    long_document,
    "thematic"
)

# Progressive summarization
print("\nProgressive summarization:")
final_summary = streaming_processor.progressive_summarization(
    long_document,
    target_length=300
)

Batch Processing

python
class BatchProcessor:
    """Process multiple long documents efficiently"""
    
    def __init__(self, client):
        self.client = client
        self.chunker = SmartChunker()
    
    def create_batch_file(self, documents: List[Dict[str, str]], 
                         output_file: str = "batch_requests.jsonl"):
        """Create batch file for multiple document processing"""
        
        batch_requests = []
        
        for doc_id, doc_data in enumerate(documents):
            document = doc_data["content"]
            task = doc_data.get("task", "Analyze this document")
            
            # Chunk document if too long
            if count_tokens(document) > 100000:  # 100K token limit for batch
                chunks = self.chunker.chunk_by_paragraphs(document)
                
                for chunk_id, chunk in enumerate(chunks):
                    request = {
                        "custom_id": f"doc_{doc_id}_chunk_{chunk_id}",
                        "method": "POST",
                        "url": "/v1/chat/completions",
                        "body": {
                            "model": "deepseek-chat",
                            "messages": [
                                {"role": "system", "content": "You are a document analysis assistant."},
                                {"role": "user", "content": f"{task}\n\nDocument chunk:\n{chunk}"}
                            ]
                        }
                    }
                    batch_requests.append(request)
            else:
                request = {
                    "custom_id": f"doc_{doc_id}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "deepseek-chat",
                        "messages": [
                            {"role": "system", "content": "You are a document analysis assistant."},
                            {"role": "user", "content": f"{task}\n\nDocument:\n{document}"}
                        ]
                    }
                }
                batch_requests.append(request)
        
        # Write to JSONL file
        with open(output_file, 'w') as f:
            for request in batch_requests:
                f.write(json.dumps(request) + '\n')
        
        return output_file, len(batch_requests)
    
    def process_batch_results(self, results_file: str) -> Dict[str, List[str]]:
        """Process batch results and group by document"""
        
        results = {}
        
        with open(results_file, 'r') as f:
            for line in f:
                result = json.loads(line)
                custom_id = result["custom_id"]
                
                # Extract document ID
                if "_chunk_" in custom_id:
                    doc_id = custom_id.split("_chunk_")[0]
                else:
                    doc_id = custom_id
                
                if doc_id not in results:
                    results[doc_id] = []
                
                content = result["response"]["body"]["choices"][0]["message"]["content"]
                results[doc_id].append(content)
        
        return results

# Example usage
batch_processor = BatchProcessor(client)

# Prepare documents for batch processing
documents = [
    {
        "content": "Long document 1 content...",
        "task": "Summarize the main points"
    },
    {
        "content": "Long document 2 content...",
        "task": "Extract key insights"
    },
    {
        "content": "Long document 3 content...",
        "task": "Identify action items"
    }
]

# Create batch file
batch_file, request_count = batch_processor.create_batch_file(documents)
print(f"Created batch file with {request_count} requests")

# Note: You would then upload this file using the batch API
# and process results when complete
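
If DeepSeek's batch workflow follows the OpenAI-compatible Batch API (an assumption; verify endpoint support and parameters against the official documentation), the upload and submission step would look roughly like this sketch:

python
# Sketch only: assumes OpenAI-compatible /files and /batches endpoints;
# confirm availability in DeepSeek's documentation before relying on this
with open(batch_file, "rb") as f:
    uploaded = client.files.create(file=f, purpose="batch")

batch = client.batches.create(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)
print(f"Submitted batch {batch.id}; poll client.batches.retrieve(batch.id) for status")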

Advanced Techniques

Hierarchical Processing

python
class HierarchicalProcessor:
    """Process documents using hierarchical analysis"""
    
    def __init__(self, client):
        self.client = client
        self.chunker = SmartChunker()
    
    def hierarchical_analysis(self, document: str, levels: List[str]) -> Dict[str, Any]:
        """Perform multi-level hierarchical analysis"""
        
        results = {}
        current_text = document
        
        for level in levels:
            print(f"Processing level: {level}")
            
            if level == "overview":
                # High-level overview
                results[level] = self.get_overview(current_text)
            
            elif level == "sections":
                # Section-by-section analysis
                results[level] = self.analyze_sections(current_text)
            
            elif level == "details":
                # Detailed analysis
                results[level] = self.detailed_analysis(current_text)
            
            elif level == "synthesis":
                # Synthesize all previous levels
                results[level] = self.synthesize_analysis(results)
        
        return results
    
    def get_overview(self, text: str) -> str:
        """Get high-level overview of document"""
        
        # Sample the beginning and end for the overview (slices are in
        # characters, which only roughly tracks the token count)
        text_tokens = count_tokens(text)
        if text_tokens > 8000:
            beginning = text[:4000]
            ending = text[-4000:]
            sample_text = beginning + "\n\n[... middle content ...]\n\n" + ending
        else:
            sample_text = text
        
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a document analysis expert."},
                {"role": "user", "content": f"""
Provide a high-level overview of this document:

{sample_text}

Include:
- Main topic and purpose
- Key themes
- Document structure
- Target audience
"""}
            ]
        )
        
        return response.choices[0].message.content
    
    def analyze_sections(self, text: str) -> List[Dict[str, str]]:
        """Analyze document section by section"""
        
        chunks = self.chunker.chunk_by_structure(text)
        section_analyses = []
        
        for i, chunk in enumerate(chunks):
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a section analysis expert."},
                    {"role": "user", "content": f"""
Analyze this document section:

{chunk}

Provide:
- Section summary
- Key points
- Important details
- Relationship to overall document
"""}
                ]
            )
            
            section_analyses.append({
                "section_id": i + 1,
                "content_preview": chunk[:200] + "...",
                "analysis": response.choices[0].message.content
            })
        
        return section_analyses
    
    def detailed_analysis(self, text: str) -> Dict[str, Any]:
        """Perform detailed analysis of specific aspects"""
        
        aspects = [
            "arguments_and_evidence",
            "methodology",
            "conclusions",
            "limitations",
            "implications"
        ]
        
        detailed_results = {}
        
        for aspect in aspects:
            prompt = f"""
Analyze the {aspect.replace('_', ' ')} in this document:

{text[:8000]}...

Focus specifically on {aspect.replace('_', ' ')} and provide detailed insights.
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": f"You are an expert in analyzing {aspect.replace('_', ' ')}."},
                    {"role": "user", "content": prompt}
                ]
            )
            
            detailed_results[aspect] = response.choices[0].message.content
        
        return detailed_results
    
    def synthesize_analysis(self, previous_results: Dict[str, Any]) -> str:
        """Synthesize all previous analysis levels"""
        
        synthesis_input = ""
        for level, result in previous_results.items():
            if level != "synthesis":  # Don't include synthesis in synthesis
                synthesis_input += f"\n\n{level.upper()} ANALYSIS:\n{str(result)[:1000]}..."
        
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a synthesis expert who combines multiple analyses."},
                {"role": "user", "content": f"""
Synthesize these multiple levels of analysis into a comprehensive understanding:

{synthesis_input}

Provide:
- Integrated insights
- Cross-level connections
- Overall assessment
- Key takeaways
"""}
            ]
        )
        
        return response.choices[0].message.content

# Example usage
hierarchical_processor = HierarchicalProcessor(client)

analysis_levels = ["overview", "sections", "details", "synthesis"]
hierarchical_results = hierarchical_processor.hierarchical_analysis(
    long_document,
    analysis_levels
)

for level, result in hierarchical_results.items():
    print(f"\n{level.upper()} ANALYSIS:")
    print("=" * 50)
    print(result)

Best Practices

Optimization Guidelines

  1. Choose appropriate chunking: Use semantic boundaries when possible
  2. Preserve context: Maintain overlap between chunks for coherence
  3. Monitor token usage: Stay within model limits for optimal performance
  4. Use streaming: For real-time processing of long texts
  5. Implement caching: Cache results for repeated processing (see the sketch below)
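
For point 5, one minimal approach is to key chunk-level results by a hash of the model, task, and chunk text. This is an illustrative sketch; cached_completion is a name introduced here, not part of any SDK.

python
import hashlib

_result_cache: dict = {}

def cached_completion(client, chunk: str, task: str, model: str = "deepseek-chat") -> str:
    """Illustrative sketch: memoize chunk-level results by content hash."""
    key = hashlib.sha256(f"{model}|{task}|{chunk}".encode()).hexdigest()
    if key not in _result_cache:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": f"{task}\n\n{chunk}"}]
        )
        _result_cache[key] = response.choices[0].message.content
    return _result_cache[key]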

Performance Tips

python
# ✅ Good: Smart chunking with context preservation
chunker = SmartChunker(max_chunk_size=6000, overlap_size=300)
chunks = chunker.chunk_by_paragraphs(document)

# ✅ Good: Monitor token usage
context_info = check_context_limit(messages)
if not context_info["within_limit"]:
    chunks = chunker.chunk_by_paragraphs(document)  # Handle oversized input

# ✅ Good: Use appropriate processing strategy
if count_tokens(document) > 50000:
    # Use parallel processing for very long documents
    results = parallel_processor.run_parallel_processing(document, task)
else:
    # Use sequential processing for moderate length
    results = processor.process_document_sequentially(document, task)

# ❌ Bad: No chunking for long documents
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": very_long_document}]  # May exceed limits
)

# ❌ Bad: Ignoring token limits
# Not checking if input fits within context window

Troubleshooting

Common Issues

  1. Context length exceeded: Implement proper chunking
  2. Loss of coherence: Use overlapping chunks and context preservation
  3. Slow processing: Consider parallel processing for large documents
  4. Memory issues: Use streaming for very large texts

Debug Tools

python
def debug_long_text_processing(text: str, max_tokens: int = 128000):
    """Debug long text processing issues"""
    
    # Count tokens once and reuse the result
    token_count = count_tokens(text)
    print(f"Text length: {len(text)} characters")
    print(f"Token count: {token_count}")
    print(f"Within limit: {token_count <= max_tokens}")
    
    chunks = []
    if token_count > max_tokens:
        chunker = SmartChunker()
        chunks = chunker.chunk_by_paragraphs(text)
        print(f"Suggested chunks: {len(chunks)}")
        
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1} tokens: {count_tokens(chunk)}")
    
    return {
        "needs_chunking": token_count > max_tokens,
        "suggested_chunks": len(chunks) if chunks else 1
    }

# Usage
debug_info = debug_long_text_processing(long_document)
print("Debug info:", debug_info)
