Code Examples

Production-ready code samples for llcuda v1.1.0. All examples tested on GeForce 940M (1GB VRAM) and Tesla T4 (Colab/Kaggle).

Note: On first import, llcuda auto-downloads its binaries and models; this one-time setup takes roughly 3-5 minutes.


Basic Usage

Hello World

import llcuda

engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

result = engine.infer("What is Python?")
print(result.text)

Interactive Chat

Multi-Turn Conversation

import llcuda

engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

conversation = [
    "What is machine learning?",
    "How does it differ from AI?",
    "Give me a practical example"
]

for prompt in conversation:
    result = engine.infer(prompt, max_tokens=100)
    print(f"Q: {prompt}")
    print(f"A: {result.text}\n")

JupyterLab Integration

Data Analysis with LLM

import pandas as pd
import llcuda

# Load data
df = pd.read_csv("sales_data.csv")

# Create engine
engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

# Get data summary
summary = df.describe().to_string()

# Analyze with LLM
analysis = engine.infer(
    f"Analyze this sales data and provide insights:\n{summary}",
    max_tokens=200
)

print(analysis.text)

Batch Processing

Process Multiple Prompts

import llcuda

engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

prompts = [
    "Explain neural networks",
    "What is deep learning?",
    "Describe NLP"
]

# Batch inference (more efficient than calling infer() in a loop)
results = engine.batch_infer(prompts, max_tokens=80)

for prompt, result in zip(prompts, results):
    print(f"Q: {prompt}")
    print(f"A: {result.text}")
    print(f"Speed: {result.tokens_per_sec:.1f} tok/s\n")

Code Generation

Generate and Review Code

import llcuda

engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

# Generate code
code_prompt = "Write a Python function to calculate Fibonacci numbers"
code_result = engine.infer(code_prompt, max_tokens=150)
print("Generated Code:")
print(code_result.text)

# Review code
review_prompt = f"Review this code for improvements:\n{code_result.text}"
review_result = engine.infer(review_prompt, max_tokens=150)
print("\nCode Review:")
print(review_result.text)

Temperature Comparison

Experiment with Different Temperatures

Lower temperatures produce more focused, deterministic output; higher temperatures produce more varied, creative output.

import llcuda

engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

prompt = "Write a haiku about AI"
temperatures = [0.3, 0.7, 1.2]

for temp in temperatures:
    result = engine.infer(prompt, temperature=temp, max_tokens=50)
    print(f"\nTemperature {temp}:")
    print(result.text)

Performance Monitoring

Track Latency and Throughput

import llcuda

engine = llcuda.InferenceEngine()
engine.load_model("gemma-3-1b-Q4_K_M")

# Run some inferences
for _ in range(20):
    engine.infer("Hello, how are you?", max_tokens=20)

# Get performance metrics
metrics = engine.get_metrics()

print("Latency Statistics:")
print(f"  Mean: {metrics['latency']['mean_ms']:.2f} ms")
print(f"  p50:  {metrics['latency']['p50_ms']:.2f} ms")
print(f"  p95:  {metrics['latency']['p95_ms']:.2f} ms")
print(f"  p99:  {metrics['latency']['p99_ms']:.2f} ms")

print("\nThroughput:")
print(f"  Total Tokens: {metrics['throughput']['total_tokens']}")
print(f"  Tokens/sec: {metrics['throughput']['tokens_per_sec']:.2f}")

Error Handling

Robust Production Code

import llcuda

try:
    engine = llcuda.InferenceEngine()
    engine.load_model("gemma-3-1b-Q4_K_M")

    result = engine.infer("What is AI?", max_tokens=100)

    if result.success:
        print(result.text)
    else:
        print(f"Inference failed: {result.error_message}")

except Exception as e:
    print(f"Error: {e}")
finally:
    if 'engine' in locals():
        engine.unload_model()
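
Building on the result.success flag shown above, here is a hypothetical retry helper for transient failures (the backoff policy is illustrative, not part of llcuda):

import time

def infer_with_retry(engine, prompt, retries=3, **kwargs):
    """Retry inference on failure with a simple linear backoff."""
    last_error = None
    for attempt in range(1, retries + 1):
        result = engine.infer(prompt, **kwargs)
        if result.success:
            return result
        last_error = result.error_message
        time.sleep(attempt)  # wait 1s, 2s, 3s between attempts
    raise RuntimeError(f"Inference failed after {retries} attempts: {last_error}")

# Usage: result = infer_with_retry(engine, "What is AI?", max_tokens=100)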

Context Manager Pattern

Automatic Resource Cleanup

import llcuda

# Use context manager for automatic cleanup
with llcuda.InferenceEngine() as engine:
    engine.load_model("gemma-3-1b-Q4_K_M")

    result = engine.infer("Explain quantum computing", max_tokens=100)
    print(result.text)

# Engine automatically cleaned up here

Using Local GGUF Files

Load Custom Models

import llcuda

engine = llcuda.InferenceEngine()

# Find local GGUF models
models = llcuda.find_gguf_models()

if models:
    # Use first model found
    engine.load_model(str(models[0]))
else:
    # Fall back to registry
    engine.load_model("gemma-3-1b-Q4_K_M")

result = engine.infer("Hello!", max_tokens=20)
print(result.text)
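
Since load_model accepts a path string (as the str(models[0]) call above shows), a specific file can also be loaded directly; the path below is a placeholder:

# Load a specific GGUF file by explicit path (placeholder path)
engine.load_model("/path/to/model.gguf")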

Production Pattern: API Wrapper

Build a Simple API

import llcuda
from typing import Dict, List

class LLMService:
    def __init__(self, model_name: str = "gemma-3-1b-Q4_K_M"):
        self.engine = llcuda.InferenceEngine()
        self.engine.load_model(model_name)

    def generate(self, prompt: str, max_tokens: int = 100) -> Dict:
        result = self.engine.infer(prompt, max_tokens=max_tokens)

        return {
            "text": result.text,
            "tokens": result.tokens_generated,
            "speed": result.tokens_per_sec,
            "latency_ms": result.latency_ms
        }

    def batch_generate(self, prompts: List[str], max_tokens: int = 100) -> List[Dict]:
        results = self.engine.batch_infer(prompts, max_tokens=max_tokens)
        return [
            {
                "text": r.text,
                "tokens": r.tokens_generated,
                "speed": r.tokens_per_sec
            }
            for r in results
        ]

    def get_stats(self) -> Dict:
        return self.engine.get_metrics()

    def cleanup(self):
        self.engine.unload_model()

# Usage
service = LLMService()
response = service.generate("What is AI?")
print(response)
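
To serve the wrapper over HTTP, it can be dropped into a web framework. A hypothetical sketch using FastAPI (the route and request schema are illustrative, not part of llcuda):

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
service = LLMService()  # load the model once at startup

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 100

@app.post("/generate")
def generate(request: GenerateRequest):
    # Delegates to the LLMService wrapper defined above
    return service.generate(request.prompt, max_tokens=request.max_tokens)

# Run with e.g.: uvicorn your_module:app --host 0.0.0.0 --port 8000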

Complete JupyterLab Example

See the full JupyterLab notebook with:

  • System info checks
  • Model registry listing
  • Batch inference
  • Performance visualization
  • Context manager usage
  • Temperature comparisons

Next Steps