Venice AI works seamlessly with LangChain thanks to full OpenAI SDK compatibility. Build chains, agents, and RAG pipelines with Venice’s privacy-first infrastructure.
Setup
pip install langchain langchain-openai openai
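If you prefer not to hardcode the API key in your scripts, read it from an environment variable first (the VENICE_API_KEY name below is just a convention assumed here, not something the SDK requires):

import os

# Assumes you've exported VENICE_API_KEY in your shell beforehand
api_key = os.environ["VENICE_API_KEY"]

You can then pass api_key=api_key anywhere the examples below use a literal string.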
Chat Models
Use ChatOpenAI with Venice’s base URL:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="venice-uncensored-1-2",
    api_key="your-venice-api-key",
    base_url="https://api.venice.ai/api/v1",
    temperature=0.7,
)

response = llm.invoke("Explain privacy-preserving AI in 2 sentences.")
print(response.content)
Streaming
for chunk in llm.stream("Write a haiku about decentralization."):
    print(chunk.content, end="", flush=True)
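LangChain chat models also expose an async variant, astream, which behaves the same way against the Venice endpoint; a minimal sketch:

import asyncio

async def main():
    # Chunks arrive asynchronously, which suits async web frameworks
    async for chunk in llm.astream("Write a haiku about decentralization."):
        print(chunk.content, end="", flush=True)

asyncio.run(main())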
Embeddings
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-bge-m3",
    api_key="your-venice-api-key",
    base_url="https://api.venice.ai/api/v1",
    check_embedding_ctx_length=False,  # Required for Venice
)

vectors = embeddings.embed_documents([
    "Venice AI provides private inference.",
    "No data is retained after processing.",
])
print(f"Embedding dimension: {len(vectors[0])}")
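For a single query string (for example at retrieval time), embed_query returns one vector rather than a list:

query_vector = embeddings.embed_query("How does Venice handle my data?")
print(len(query_vector))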
Chains
Simple Chain with Prompt Template
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a {role}. Answer concisely."),
    ("user", "{question}"),
])

chain = prompt | llm
response = chain.invoke({"role": "privacy expert", "question": "Why does zero data retention matter?"})
print(response.content)
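Chains built with LCEL also support batch, which runs multiple inputs concurrently; a quick sketch reusing the chain above (the second input is illustrative):

responses = chain.batch([
    {"role": "privacy expert", "question": "Why does zero data retention matter?"},
    {"role": "security engineer", "question": "What is a threat model?"},
])
for response in responses:
    print(response.content)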
Sequential Chain
from langchain_core.output_parsers import StrOutputParser

# Chain 1: Generate a topic summary
summarizer = ChatPromptTemplate.from_messages([
    ("user", "Summarize this topic in 3 bullet points: {topic}")
]) | llm | StrOutputParser()

# Chain 2: Generate questions from summary
questioner = ChatPromptTemplate.from_messages([
    ("user", "Based on this summary, generate 3 thought-provoking questions:\n{summary}")
]) | llm | StrOutputParser()

# Compose
summary = summarizer.invoke({"topic": "decentralized AI inference"})
questions = questioner.invoke({"summary": summary})
print(questions)
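The two steps can also be fused into a single runnable, so the intermediate summary is piped straight into the second prompt; a minimal sketch using LCEL's dict coercion:

from langchain_core.runnables import RunnablePassthrough

# The dict is coerced into a RunnableParallel that feeds the summary
# string into the {summary} variable of the second prompt
pipeline = summarizer | {"summary": RunnablePassthrough()} | questioner
print(pipeline.invoke({"topic": "decentralized AI inference"}))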
RAG Pipeline
Build a retrieval-augmented generation pipeline with Venice. This example also needs the langchain-community and faiss-cpu packages (pip install langchain-community faiss-cpu):
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Initialize Venice models
llm = ChatOpenAI(
    model="zai-org-glm-5-1",
    api_key="your-venice-api-key",
    base_url="https://api.venice.ai/api/v1",
)

embeddings = OpenAIEmbeddings(
    model="text-embedding-bge-m3",
    api_key="your-venice-api-key",
    base_url="https://api.venice.ai/api/v1",
    check_embedding_ctx_length=False,
)

# Example documents (in practice, load and chunk your own corpus)
documents = [
    "Venice AI provides private, uncensored AI inference with zero data retention.",
    "The Venice API is OpenAI-compatible, supporting chat completions, images, audio, video, and embeddings.",
    "Venice supports function calling, structured outputs, web search, and reasoning models.",
    "Privacy levels include Private (zero retention) and Anonymized (third-party processed).",
]

# Create vector store
vectorstore = FAISS.from_texts(documents, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# RAG prompt
rag_prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the question based only on the following context:\n\n{context}"),
    ("user", "{question}"),
])

# RAG chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke("What privacy levels does Venice offer?")
print(answer)
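To avoid re-embedding documents on every run, the FAISS index can be persisted with save_local and reloaded with load_local (the "venice_index" folder name below is arbitrary):

# Save the index, then restore it in a later session
vectorstore.save_local("venice_index")

restored = FAISS.load_local(
    "venice_index",
    embeddings,
    allow_dangerous_deserialization=True,  # the index is pickled on disk
)
retriever = restored.as_retriever(search_kwargs={"k": 2})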
Function Calling with Agents
from langchain_core.tools import tool
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate

# Use a function-calling capable model
llm = ChatOpenAI(
    model="zai-org-glm-5-1",
    api_key="your-venice-api-key",
    base_url="https://api.venice.ai/api/v1",
)

@tool
def get_venice_model_price(model_id: str) -> str:
    """Get the pricing for a Venice AI model."""
    prices = {
        "venice-uncensored-1-2": "Input: $0.20/1M, Output: $0.90/1M",
        "zai-org-glm-5-1": "Input: $1.75/1M, Output: $5.50/1M",
        "qwen3-5-9b": "Input: $0.10/1M, Output: $0.15/1M",
    }
    return prices.get(model_id, f"Model {model_id} not found in price list.")

prompt = ChatPromptTemplate.from_messages([
    ("system", "You help users find the right Venice AI model. Use tools when needed."),
    ("placeholder", "{chat_history}"),
    ("user", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])

agent = create_tool_calling_agent(llm, [get_venice_model_price], prompt)
executor = AgentExecutor(agent=agent, tools=[get_venice_model_price], verbose=True)

result = executor.invoke({"input": "What's the cheapest Venice text model?", "chat_history": []})
print(result["output"])
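Since the prompt includes a {chat_history} placeholder, prior turns can be carried into the next call as message objects; a minimal sketch:

from langchain_core.messages import HumanMessage, AIMessage

history = [
    HumanMessage(content="I only care about budget models."),
    AIMessage(content="Understood, I'll focus on the cheapest options."),
]
result = executor.invoke({"input": "So which one should I pick?", "chat_history": history})
print(result["output"])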
Structured Output
from pydantic import BaseModel, Field

class MovieReview(BaseModel):
    title: str = Field(description="Movie title")
    rating: float = Field(description="Rating out of 10")
    summary: str = Field(description="One-sentence summary")

structured_llm = llm.with_structured_output(MovieReview)
review = structured_llm.invoke("Review the movie Inception")
print(f"{review.title}: {review.rating}/10 — {review.summary}")
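Note that with_structured_output typically relies on the model's tool-calling support, so pick a function-calling capable model. It also composes like any other runnable and can terminate a prompt chain; a sketch with an illustrative prompt:

from langchain_core.prompts import ChatPromptTemplate

review_prompt = ChatPromptTemplate.from_messages([
    ("user", "Write a short review of the movie {movie}."),
])
review_chain = review_prompt | structured_llm

review = review_chain.invoke({"movie": "Blade Runner 2049"})
print(review.rating)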
Web Search Integration
Use Venice’s built-in web search via venice_parameters:
from langchain_openai import ChatOpenAI

llm_with_search = ChatOpenAI(
    model="venice-uncensored",
    api_key="your-venice-api-key",
    base_url="https://api.venice.ai/api/v1",
    extra_body={
        "venice_parameters": {
            "enable_web_search": "auto"
        }
    },
)

response = llm_with_search.invoke("What are the latest developments in AI this week?")
print(response.content)
Or pass it per-request:
response = llm.invoke(
    "What are the latest developments in AI this week?",
    extra_body={"venice_parameters": {"enable_web_search": "auto"}},
)
Recommended Models for LangChain
| Use Case | Model | Why |
| --- | --- | --- |
| General chains | venice-uncensored | Fast, cheap, uncensored |
| Complex reasoning | zai-org-glm-5-1 | Best private flagship model |
| Function calling | zai-org-glm-5-1 | Reliable tool use |
| Vision + text | qwen3-vl-235b-a22b | Advanced vision understanding |
| Code generation | qwen3-coder-480b-a35b-instruct | Optimized for code |
| Embeddings (RAG) | text-embedding-bge-m3 | Private embeddings |
| Budget / high-volume | qwen3-5-9b | $0.10/1M input |
View All Models: Browse all Venice models with pricing and capabilities.