Quick Start: HuggingFace Hub + HF-Inferoxy
Get started with managed HuggingFace API tokens in under 5 minutes.
Setup
- Install dependencies:

  uv add huggingface-hub requests

- Create the token utility file. Save the following as hf_token_utils.py:
import requests
from typing import Tuple


def get_proxy_token(proxy_url: str = "http://localhost:8000", api_key: str = None) -> Tuple[str, str]:
    """Provision a managed HF token from the HF-Inferoxy server."""
    response = requests.get(
        f"{proxy_url}/keys/provision",
        headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
    )
    if response.status_code != 200:
        raise Exception(f"Failed to get token: {response.text}")
    data = response.json()
    return data["token"], data["token_id"]


def report_token_status(
    token_id: str,
    status: str = "success",
    error: str = None,
    proxy_url: str = "http://localhost:8000",
    api_key: str = None,
    client_name: str | None = None,
) -> bool:
    """Report how a provisioned token was used so the proxy can rotate bad keys."""
    payload = {"token_id": token_id, "status": status}
    if error:
        payload["error"] = error
        # Classify common HF error responses so the proxy can react appropriately
        if "401 Client Error" in error:
            payload["error_type"] = "invalid_credentials"
        elif "402 Client Error" in error and "exceeded your monthly included credits" in error:
            payload["error_type"] = "credits_exceeded"
    if client_name:
        payload["client_name"] = client_name
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    try:
        response = requests.post(f"{proxy_url}/keys/report", json=payload, headers=headers)
        return response.status_code == 200
    except requests.RequestException:
        return False
⚠️ Important: Authentication Required
All client operations now require authentication with the HF-Inferoxy server. This is part of the Role-Based Access Control (RBAC) system.
Getting Your API Key
- Default Admin User: The system creates a default admin user on first run. Check your server logs or the users.json file for the default admin credentials.
- Create a User Account: Use the admin account to create a regular user account:

  curl -X POST "http://localhost:8000/admin/users" \
    -H "Authorization: Bearer ADMIN_API_KEY" \
    -H "Content-Type: application/json" \
    -d '{"username": "youruser", "email": "user@example.com", "full_name": "Your Name", "role": "user"}'

- Use the Generated API Key: The response will include an API key that you'll use in all client operations (see the sketch below for loading it from an environment variable).
For detailed RBAC setup, see RBAC_README.md.
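The examples in this guide hardcode the proxy API key for readability. In practice you will usually read it from an environment variable instead; a minimal sketch, assuming a variable name of HF_PROXY_API_KEY (the name is illustrative, not something the server requires):

import os

# HF_PROXY_API_KEY is an illustrative variable name, not mandated by HF-Inferoxy
proxy_api_key = os.environ.get("HF_PROXY_API_KEY")
if not proxy_api_key:
    raise RuntimeError("Set HF_PROXY_API_KEY to the API key issued by your HF-Inferoxy admin")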
Basic Usage
End-user tracking (optional)
Include an end-user identifier in reports via client_name. If omitted, the server defaults to the authenticated username.
end_user = "customer_123" # from your app's user context
token, token_id = get_proxy_token(api_key=proxy_api_key)
# on success
report_token_status(token_id, "success", api_key=proxy_api_key, client_name=end_user)
# on error (inside an except block, where e is the caught exception)
report_token_status(token_id, "error", str(e), api_key=proxy_api_key, client_name=end_user)
Chat Completion
With HF-Inference (Default)
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from hf_token_utils import get_proxy_token, report_token_status
# Get managed token (requires authentication)
proxy_api_key = "your_proxy_api_key_here" # Get this from admin
token, token_id = get_proxy_token(api_key=proxy_api_key)
# Create client
client = InferenceClient(provider="hf-inference", api_key=token)
try:
    # Make request
    completion = client.chat.completions.create(
        model="HuggingFaceTB/SmolLM3-3B",
        messages=[{"role": "user", "content": "Hello!"}]
    )
    # Report success
    report_token_status(token_id, api_key=proxy_api_key)
    print(completion.choices[0].message.content)
except HfHubHTTPError as e:
    # Report error
    report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
    raise
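If you want tokens to appear incrementally, the same client supports streaming via stream=True on chat.completions.create; a brief sketch (wrap it in the same try/except and report_token_status pattern shown above):

stream = client.chat.completions.create(
    model="HuggingFaceTB/SmolLM3-3B",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    # Each chunk carries an incremental delta; content can be None on the final chunk
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()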
With Other Providers
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from hf_token_utils import get_proxy_token, report_token_status
# Get managed token (requires authentication)
proxy_api_key = "your_proxy_api_key_here" # Get this from admin
token, token_id = get_proxy_token(api_key=proxy_api_key)
# Choose your provider
providers = {
    "cerebras": "openai/gpt-oss-120b",
    "cohere": "CohereLabs/c4ai-command-r-plus",
    "groq": "openai/gpt-oss-120b",
    "together": "openai/gpt-oss-120b"
}
# Example with Cerebras
client = InferenceClient(provider="cerebras", api_key=token)
try:
    completion = client.chat.completions.create(
        model=providers["cerebras"],
        messages=[{"role": "user", "content": "Hello!"}]
    )
    report_token_status(token_id, api_key=proxy_api_key)
    print(completion.choices[0].message.content)
except HfHubHTTPError as e:
    report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
    raise
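To smoke-test several providers in one run, you can loop over the providers mapping above. A minimal sketch that provisions a fresh managed token per provider and keeps going on failures:

for provider, model in providers.items():
    token, token_id = get_proxy_token(api_key=proxy_api_key)
    client = InferenceClient(provider=provider, api_key=token)
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Hello!"}]
        )
        report_token_status(token_id, api_key=proxy_api_key)
        print(f"{provider}: {completion.choices[0].message.content[:80]}")
    except HfHubHTTPError as e:
        # Report the failure and move on to the next provider
        report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
        print(f"{provider} failed: {e}")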
Feature Extraction
from huggingface_hub import InferenceClient
from hf_token_utils import get_proxy_token, report_token_status
# Get managed token (requires authentication)
proxy_api_key = "your_proxy_api_key_here" # Get this from admin
token, token_id = get_proxy_token(api_key=proxy_api_key)
client = InferenceClient(provider="hf-inference", api_key=token)
try:
    # Extract features
    embeddings = client.feature_extraction(
        "Hello world",
        model="intfloat/multilingual-e5-large"
    )
    # Report success
    report_token_status(token_id, api_key=proxy_api_key)
    print(f"Got embeddings with length: {len(embeddings)}")
except Exception as e:
    report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
    raise
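Embeddings are typically used for similarity search. A minimal sketch, assuming feature_extraction returns a numpy-compatible array for this model (shapes vary by model, so the helper pools token-level outputs into a single vector):

import numpy as np

def embed(text: str) -> np.ndarray:
    vec = np.asarray(client.feature_extraction(text, model="intfloat/multilingual-e5-large"))
    # Some models return token-level features; average them into one pooled vector
    return vec.mean(axis=0) if vec.ndim > 1 else vec

a = embed("How do I reset my password?")
b = embed("I forgot my login credentials.")
cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"Cosine similarity: {cosine:.3f}")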
Vision-Language Models (VLM)
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from hf_token_utils import get_proxy_token, report_token_status
# Get managed token (requires authentication)
proxy_api_key = "your_proxy_api_key_here" # Get this from admin
token, token_id = get_proxy_token(api_key=proxy_api_key)
# Use a VLM-capable provider
client = InferenceClient(provider="cerebras", api_key=token)
try:
    completion = client.chat.completions.create(
        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in one sentence."},
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://example.com/image.jpg"}
                    }
                ]
            }
        ]
    )
    report_token_status(token_id, api_key=proxy_api_key)
    print(completion.choices[0].message.content)
except HfHubHTTPError as e:
    report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
    raise
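Remote URLs are the simplest input, but most OpenAI-compatible providers also accept images embedded as base64 data URIs, which is handy for local files. A sketch assuming a local photo.jpg; substitute it for the remote-URL entry in the messages above:

import base64

with open("photo.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

image_message = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}
}
# Use image_message in place of the remote-URL entry in the content list above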
Image Generation
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from hf_token_utils import get_proxy_token, report_token_status
# Get managed token (requires authentication)
proxy_api_key = "your_proxy_api_key_here" # Get this from admin
token, token_id = get_proxy_token(api_key=proxy_api_key)
# Use an image generation provider
client = InferenceClient(provider="fal-ai", api_key=token)
try:
    # Generate image
    image = client.text_to_image(
        "Astronaut riding a horse",
        model="Qwen/Qwen-Image"
    )
    # Report success
    report_token_status(token_id, api_key=proxy_api_key)
    # Save the image
    image.save("astronaut_horse.png")
    print("Image generated successfully!")
except HfHubHTTPError as e:
    report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
    raise
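text_to_image returns a PIL image and also accepts optional generation parameters such as negative_prompt, width, and height; whether they are honored depends on the provider and model, so treat this as an illustrative sketch rather than a guaranteed feature set:

image = client.text_to_image(
    "Astronaut riding a horse",
    model="Qwen/Qwen-Image",
    negative_prompt="blurry, low quality",  # optional; not every provider honors it
    width=1024,
    height=768,
)
image.save("astronaut_horse_hd.png")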
With Auto-Retry
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from hf_token_utils import get_proxy_token, report_token_status
def chat_with_retry(message: str, provider: str = "hf-inference", max_retries: int = 2, proxy_api_key: str = None):
    for attempt in range(max_retries + 1):
        # Get fresh token for each attempt
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        client = InferenceClient(provider=provider, api_key=token)
        try:
            # Choose model based on provider
            model = "HuggingFaceTB/SmolLM3-3B" if provider == "hf-inference" else "openai/gpt-oss-120b"
            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": message}]
            )
            # Report success
            report_token_status(token_id, api_key=proxy_api_key)
            return completion.choices[0].message.content
        except HfHubHTTPError as e:
            error_str = str(e)
            report_token_status(token_id, "error", error_str, api_key=proxy_api_key)
            # Retry on auth errors, otherwise raise
            if attempt < max_retries and ("401 Client Error" in error_str or "402 Client Error" in error_str):
                print(f"Token error on attempt {attempt + 1}, retrying...")
                continue
            else:
                raise
        except Exception as e:
            report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
            raise
    raise Exception("All retries failed")
# Usage with different providers
# You need to get your API key from the admin or create a user account
proxy_api_key = "your_proxy_api_key_here" # Get this from admin
response = chat_with_retry("What is machine learning?", provider="cerebras", proxy_api_key=proxy_api_key)
print(response)
Context Manager Pattern
from contextlib import contextmanager
from huggingface_hub import InferenceClient
from hf_token_utils import get_proxy_token, report_token_status
@contextmanager
def managed_client(provider: str = "hf-inference", proxy_api_key: str = None):
    token, token_id = get_proxy_token(api_key=proxy_api_key)
    client = InferenceClient(provider=provider, api_key=token)
    try:
        yield client
        report_token_status(token_id, api_key=proxy_api_key)  # Success
    except Exception as e:
        report_token_status(token_id, "error", str(e), api_key=proxy_api_key)
        raise

# Usage with different providers
# You need to get your API key from the admin or create a user account
proxy_api_key = "your_proxy_api_key_here"  # Get this from admin

with managed_client("cerebras", proxy_api_key) as client:
    result = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[{"role": "user", "content": "Hello"}]
    )
    print(result.choices[0].message.content)

with managed_client("hf-inference", proxy_api_key) as client:
    embeddings = client.feature_extraction("Hello", model="intfloat/multilingual-e5-large")
    print(f"Embedding length: {len(embeddings)}")
Available Providers
HF-Inferoxy supports these providers with comprehensive examples:
Provider | Capabilities | Example File |
---|---|---|
Cerebras | LLM, VLM, Text Generation | cerebras.md |
Cohere | LLM, VLM, Advanced Language Models | cohere.md |
Fal-AI | Speech, Image, Video Generation | fal-ai.md |
Featherless AI | LLM, VLM, Text Generation | featherless-ai.md |
Fireworks AI | LLM, VLM, Fast Inference | fireworks-ai.md |
Groq | LLM, VLM, Ultra-fast Inference | groq.md |
HF-Inference | All Tasks, Core API | hf-inference.md |
Hyperbolic | LLM, VLM, Vision AI | hyperbolic.md |
Nebius | LLM, VLM, Cloud AI Services | nebius.md |
Novita | LLM, VLM, Video Generation | novita.md |
NScale | LLM, VLM, Image Generation | nscale.md |
Replicate | Image, Video Generation | replicate.md |
SambaNova | LLM, VLM, Enterprise AI | sambanova.md |
Together | LLM, VLM, Collaborative AI | together.md |
Testing Your Setup
Create test_setup.py:
#!/usr/bin/env python3
from hf_token_utils import get_proxy_token, report_token_status
def test_connection(proxy_api_key: str):
    try:
        # Test token provisioning
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"✅ Got token: {token_id}")
        # Test status reporting
        success = report_token_status(token_id, "success", api_key=proxy_api_key)
        print(f"✅ Reported status: {success}")
        return True
    except Exception as e:
        print(f"❌ Error: {e}")
        return False

if __name__ == "__main__":
    print("Testing HF-Inferoxy connection...")
    # You need to get your API key from the admin or create a user account
    proxy_api_key = "your_proxy_api_key_here"  # Get this from admin
    test_connection(proxy_api_key)
Run with: uv run test_setup.py
Next Steps
- Browse provider examples: Check provider examples for specific use cases
- Read comprehensive guide: See huggingface_hub_integration.md for advanced patterns
- Check server setup: See main README for HF-Inferoxy server configuration
Common Issues
- Connection refused: Ensure HF-Inferoxy server is running
- No valid keys: Add API keys to the proxy server first
- Import errors: Install dependencies with uv add huggingface-hub requests
- Provider not found: Check provider name spelling and availability
API Reference
- get_proxy_token(proxy_url, api_key) → (token, token_id)
- report_token_status(token_id, status, error, proxy_url, api_key, client_name) → bool
That’s it! You’re now using intelligent token management with HuggingFace Hub across all supported providers.