10 KiB
Claude API — Python
Installation
```bash pip install anthropic ```
Client Initialization
```python import anthropic
# Default (uses ANTHROPIC_API_KEY env var)
client = anthropic.Anthropic()
# Explicit API key
client = anthropic.Anthropic(api_key="your-api-key")
# Async client
async_client = anthropic.AsyncAnthropic() ```
Basic Message Request
```python response = client.messages.create( model="claude-opus-4-6", max_tokens=1024, messages=[ {"role": "user", "content": "What is the capital of France?"} ] ) print(response.content[0].text) ```
System Prompts
```python response = client.messages.create( model="claude-opus-4-6", max_tokens=1024, system="You are a helpful coding assistant. Always provide examples in Python.", messages=[{"role": "user", "content": "How do I read a JSON file?"}] ) ```
Vision (Images)
Base64
```python import base64
with open("image.png", "rb") as f: image_data = base64.standard_b64encode(f.read()).decode("utf-8")
response = client.messages.create( model="claude-opus-4-6", max_tokens=1024, messages=[{ "role": "user", "content": [ { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": image_data } }, {"type": "text", "text": "What's in this image?"} ] }] ) ```
URL
```python response = client.messages.create( model="claude-opus-4-6", max_tokens=1024, messages=[{ "role": "user", "content": [ { "type": "image", "source": { "type": "url", "url": "https://example.com/image.png" } }, {"type": "text", "text": "Describe this image"} ] }] ) ```
Prompt Caching
Cache large context to reduce costs (up to 90% savings).
```python response = client.messages.create( model="claude-opus-4-6", max_tokens=1024, system=[{ "type": "text", "text": "You are an expert on this large document...", "cache_control": {"type": "ephemeral"} # default TTL is 5 minutes }], messages=[{"role": "user", "content": "Summarize the key points"}] )
# With explicit TTL (time-to-live)
response = client.messages.create( model="claude-opus-4-6", max_tokens=1024, system=[{ "type": "text", "text": "You are an expert on this large document...", "cache_control": {"type": "ephemeral", "ttl": "1h"} # 1 hour TTL }], messages=[{"role": "user", "content": "Summarize the key points"}] ) ```
Extended Thinking
Opus 4.6 and Sonnet 4.6: Use adaptive thinking; `budget_tokens` is deprecated on both models. Older models: Use `thinking={"type": "enabled", "budget_tokens": N}`, where `budget_tokens` must be at least 1024 and less than `max_tokens`.
```python
# Opus 4.6: adaptive thinking (recommended)
response = client.messages.create( model="claude-opus-4-6", max_tokens=16000, thinking={"type": "adaptive"}, output_config={"effort": "high"}, # low | medium | high | max messages=[{"role": "user", "content": "Solve this step by step..."}] )
# Access thinking and response
for block in response.content: if block.type == "thinking": print(f"Thinking: {block.thinking}") elif block.type == "text": print(f"Response: {block.text}") ```
Error Handling
```python import anthropic
try: response = client.messages.create(...) except anthropic.BadRequestError as e: print(f"Bad request: {e.message}") except anthropic.AuthenticationError: print("Invalid API key") except anthropic.PermissionDeniedError: print("API key lacks required permissions") except anthropic.NotFoundError: print("Invalid model or endpoint") except anthropic.RateLimitError as e: retry_after = int(e.response.headers.get("retry-after", "60")) print(f"Rate limited. Retry after {retry_after}s.") except anthropic.APIStatusError as e: if e.status_code >= 500: print(f"Server error ({e.status_code}). Retry later.") else: print(f"API error: {e.message}") except anthropic.APIConnectionError: print("Network error. Check internet connection.") ```
Multi-Turn Conversations
The API is stateless — send the full conversation history each time.
```python class ConversationManager: """Manage multi-turn conversations with the Claude API."""
def __init__(self, client: anthropic.Anthropic, model: str, system: str = None):
    """Store the client and request settings and start with an empty history.

    Args:
        client: Client whose ``messages.create`` is called for each turn.
        model: Model identifier passed on every request.
        system: Optional system prompt passed on every request.
    """
    # The conversation history starts empty; send() appends to it.
    self.messages = []
    self.system = system
    self.model = model
    self.client = client
def send(self, user_message: str, **kwargs) -> str:
    """Send one user turn and return the assistant's text reply.

    The user message is appended to ``self.messages``, the whole history is
    passed to ``self.client.messages.create``, and the reply text is then
    appended to the history as the assistant turn.

    Args:
        user_message: Text of the user turn.
        **kwargs: Extra keyword arguments forwarded to ``messages.create``.
            ``max_tokens`` defaults to 1024 when not given, and ``system``
            defaults to ``self.system`` when a system prompt is configured.

    Returns:
        The concatenated text of all ``text`` blocks in the reply.

    Raises:
        ValueError: If the reply contains no text.
        Exception: Anything raised by ``messages.create`` is re-raised
            after the pending user turn has been removed from the history.
    """
    # Take max_tokens out of kwargs so it is passed exactly once; supplying
    # it both explicitly and through **kwargs raises TypeError.
    max_tokens = kwargs.pop("max_tokens", 1024)
    # Omit `system` entirely when no prompt is configured rather than
    # sending None.
    if self.system is not None:
        kwargs.setdefault("system", self.system)

    self.messages.append({"role": "user", "content": user_message})
    try:
        response = self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            messages=self.messages,
            **kwargs,
        )
        # The first block is not always text (a thinking block can come
        # first), so select blocks by type instead of by position.
        assistant_message = "".join(
            block.text for block in response.content if block.type == "text"
        )
        if not assistant_message:
            raise ValueError("response contained no text content")
    except Exception:
        # Drop the unanswered user turn so a retry does not send it twice.
        self.messages.pop()
        raise

    self.messages.append({"role": "assistant", "content": assistant_message})
    return assistant_message
# Usage
conversation = ConversationManager( client=anthropic.Anthropic(), model="claude-opus-4-6", system="You are a helpful assistant." )
response1 = conversation.send("My name is Alice.") response2 = conversation.send("What's my name?") # Claude remembers "Alice" ```
Rules:
- Messages must alternate between `user` and `assistant`
- First message must be `user`
Compaction (long conversations)
Beta, Opus 4.6 only. When conversations approach the 200K context window, compaction automatically summarizes earlier context server-side. The API returns a `compaction` block; you must pass it back on subsequent requests — append `response.content`, not just the text.
```python import anthropic
client = anthropic.Anthropic() messages = []
def chat(user_message: str) -> str:
    """Send one user turn with compaction enabled and return the reply text.

    Appends the user turn to the module-level ``messages`` history, calls
    ``client.beta.messages.create`` with the compaction edit enabled, and
    appends the full ``response.content`` as the assistant turn.

    Args:
        user_message: Text of the user turn.

    Returns:
        The text of the first ``text`` block in the response, or an empty
        string when the response contains no text block.

    Raises:
        Exception: Anything raised by the API call is re-raised after the
            pending user turn has been removed from ``messages``.
    """
    messages.append({"role": "user", "content": user_message})
    try:
        response = client.beta.messages.create(
            betas=["compact-2026-01-12"],
            model="claude-opus-4-6",
            max_tokens=4096,
            messages=messages,
            context_management={
                "edits": [{"type": "compact_20260112"}]
            },
        )
    except Exception:
        # Drop the unanswered user turn so a retry does not send it twice.
        messages.pop()
        raise

    # Append full content — compaction blocks must be preserved
    messages.append({"role": "assistant", "content": response.content})
    # Supply a default so a response without a text block yields "" instead
    # of leaking StopIteration to the caller.
    return next(
        (block.text for block in response.content if block.type == "text"),
        "",
    )
# Compaction triggers automatically when context grows large
print(chat("Help me build a Python web scraper")) print(chat("Add support for JavaScript-rendered pages")) print(chat("Now add rate limiting and error handling")) ```
Stop Reasons
The `stop_reason` field in the response indicates why the model stopped generating:
| Value | Meaning |
|---|---|
| `end_turn` | Claude finished its response naturally |
| `max_tokens` | Hit the `max_tokens` limit — increase it or use streaming |
| `stop_sequence` | Hit a custom stop sequence |
| `tool_use` | Claude wants to call a tool — execute it and continue |
| `pause_turn` | Model paused and can be resumed (agentic flows) |
| `refusal` | Claude refused for safety reasons — output may not match your schema |
Cost Optimization Strategies
1. Use Prompt Caching for Repeated Context
```python
# Cache large system prompts or documents
system_with_cache = [{ "type": "text", "text": large_document_text, # e.g., 50KB of context "cache_control": {"type": "ephemeral"} # add "ttl": "1h" for longer caching }]
# First request: full cost
# Subsequent requests: ~90% cheaper for cached portion
```
2. Choose the Right Model
```python
# Default to Opus for most tasks
response = client.messages.create( model="claude-opus-4-6", # $5.00/$25.00 per 1M tokens max_tokens=1024, messages=[{"role": "user", "content": "Explain quantum computing"}] )
# Use Sonnet for high-volume production workloads
standard_response = client.messages.create( model="claude-sonnet-4-6", # $3.00/$15.00 per 1M tokens max_tokens=1024, messages=[{"role": "user", "content": "Summarize this document"}] )
# Use Haiku only for simple, speed-critical tasks
simple_response = client.messages.create( model="claude-haiku-4-5", # $1.00/$5.00 per 1M tokens max_tokens=256, messages=[{"role": "user", "content": "Classify this as positive or negative"}] ) ```
3. Use Token Counting Before Requests
```python count_response = client.messages.count_tokens( model="claude-opus-4-6", messages=messages, system=system )
estimated_input_cost = count_response.input_tokens * 0.000005 # $5/1M tokens print(f"Estimated input cost: ${estimated_input_cost:.4f}") ```
Retry with Exponential Backoff
Note: The Anthropic SDK automatically retries rate-limit (429) and server (5xx) errors with exponential backoff. Configure the number of retries with `max_retries` (default: 2), for example `anthropic.Anthropic(max_retries=5)`. Implement custom retry logic only if you need behavior beyond what the SDK provides.
```python import time import random import anthropic
def call_with_retry(
    client: "anthropic.Anthropic",
    max_retries: int = 5,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    **kwargs
):
    """Call ``client.messages.create`` with exponential-backoff retries.

    Rate-limit errors and API status errors with a status code of 500 or
    above are retried; any other API status error is re-raised immediately.

    Args:
        client: Client whose ``messages.create`` is called.
        max_retries: Total number of attempts; must be at least 1.
        base_delay: Delay in seconds before the first retry; it doubles on
            each later retry and gets up to one second of random jitter.
        max_delay: Upper bound in seconds for any single delay.
        **kwargs: Forwarded unchanged to ``client.messages.create``.

    Returns:
        Whatever ``client.messages.create`` returns.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        anthropic.RateLimitError: If the final attempt is rate limited.
        anthropic.APIStatusError: For a non-retryable status error, or if
            the final attempt fails with a server error.
    """
    if max_retries < 1:
        # With zero attempts there is no result and no exception to raise.
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return client.messages.create(**kwargs)
        except (anthropic.RateLimitError, anthropic.APIStatusError) as e:
            retryable = (
                isinstance(e, anthropic.RateLimitError) or e.status_code >= 500
            )
            # Client errors (4xx except 429) should not be retried, and
            # after the last attempt there is nothing left to wait for, so
            # re-raise at once instead of sleeping first.
            if not retryable or attempt == max_retries - 1:
                raise

        delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
        print(f"Retry {attempt + 1}/{max_retries} after {delay:.1f}s")
        time.sleep(delay)
```