llamafile

Distribute and run LLMs with a single file. llamafile combines llama.cpp with Cosmopolitan Libc to produce multi-platform executables that run unmodified on macOS, Windows, Linux, and BSD.

Overview

Provider Type: Local

API Endpoint: http://localhost:8080/v1

Free Tier Highlights: Hardware dependent

Why Choose llamafile?

llamafile's main draw is self-contained local inference: the model weights and the server ship in one executable, and the server exposes an OpenAI-compatible HTTP API, so existing client libraries work after changing only the base URL.

Quick Start Guide

1. Download a .llamafile from HuggingFace
2. Open a terminal and make the file executable (chmod +x model.llamafile on macOS/Linux)
3. Run ./model.llamafile
4. Open http://localhost:8080 in a browser
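
The steps above can be sketched as a short shell session. The model file name below is only an example; substitute any .llamafile you downloaded from HuggingFace.

```shell
# Quick-start sketch; assumes a .llamafile has already been downloaded
# into the current directory. The name here is an example.
MODEL=llava-v1.5-7b-q4.llamafile

chmod +x "$MODEL"   # macOS/Linux: mark the downloaded file executable
./"$MODEL"          # starts the local web UI and API server on port 8080
```

After the server starts, the chat UI is at http://localhost:8080 and the API at http://localhost:8080/v1.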

Available Models

Model Name         ID                               Type   Capabilities
LLaVA 1.5 (Free)   llava-1.5-7b-q4                  Local  Vision
Mistral 7B (Free)  mistral-7b-instruct-v0.2.Q4_K_M  Local  -
TinyLlama (Free)   tinyllama-1.1b-chat-v1.0.Q8_0    Local  -
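
Once a llamafile server is running you can ask it which model it is serving. The llama.cpp-based server behind llamafile exposes an OpenAI-style model listing; the port below assumes the default of 8080.

```shell
# List the model(s) served by the local llamafile instance
# (assumes a server is already running on the default port 8080).
curl http://localhost:8080/v1/models
```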

Integration Examples

Ready-to-use code snippets for your applications.

main.py
from openai import OpenAI

# Start the server first: ./model.llamafile --server
# The local server does not validate credentials, so any api_key string works.
client = OpenAI(
    api_key="llamafile",  # placeholder; not checked by the local server
    base_url="http://localhost:8080/v1"
)

response = client.chat.completions.create(
    model="local",  # the server runs a single model, so this name is not used for routing
    messages=[
        {"role": "user", "content": "What is llamafile?"}
    ]
)

print(response.choices[0].message.content)
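
The same request works without any client library. This is a plain-HTTP sketch of the Python example above, assuming a server is already running on the default port.

```shell
# Raw HTTP equivalent of the Python snippet
# (assumes a llamafile server is running on localhost:8080).
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "local",
        "messages": [{"role": "user", "content": "What is llamafile?"}]
      }'
```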

Free Tier Pricing & Limits

Rate Limit (requests per minute): Hardware dependent

Daily Quota (requests per day): Unlimited

Token Limit (tokens per minute): Unlimited

Monthly Quota: Free, open source

Use Cases

Sharing models easily

Archiving models

Quick local testing

Education/Demos

Limitations & Considerations

File sizes are large (contain weights)

CLI usage often required

On Windows the file must be renamed to add a .exe extension; Windows also caps executables at 4 GB, so larger models keep their weights in a separate file alongside the binary

Beta software

