BentoML


An inference platform built for speed and control: deploy any AI/ML model anywhere with tailored optimization, efficient scaling, and streamlined operations. BentoML simplifies inference infrastructure while leaving you in full control of your deployments.

Inference · Deployment · Model Serving · LLM Serving · MLOps · Containerization · Scalability · Cloud · On-Premise · Hybrid Cloud

Overview

Provider Type: API
API Endpoint: http://localhost:3000
Free Tier Highlights: Hardware dependent

Why Choose BentoML?

BentoML combines a developer-friendly Python API with comprehensive documentation, so you can define a service, serve it locally, and containerize it for production in a handful of commands.

Quick Start Guide

1. Install BentoML: pip install bentoml
2. Define your service in service.py (a minimal sketch follows this list)
3. Build a Bento: bentoml build
4. Serve it: bentoml serve
5. Containerize it: bentoml containerize
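For step 2, here is a minimal, hypothetical service sketch (the Echo class and greet endpoint are illustrative names, not from BentoML's documentation); it uses the @bentoml.service and @bentoml.api decorators from BentoML 1.2+:

# service.py - a minimal BentoML service (hypothetical example)
import bentoml

@bentoml.service
class Echo:
    # Each @bentoml.api method becomes an HTTP endpoint (POST /greet)
    @bentoml.api
    def greet(self, name: str) -> str:
        return f"Hello, {name}!"

Running bentoml serve service:Echo starts the server at http://localhost:3000 (the API endpoint listed above); bentoml containerize <bento_tag> then packages the built Bento as an OCI image.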

Available Models

Model Name           ID                           Context  Capabilities
Llama 3 8B Instruct  bentoml/llama-3-8b-instruct  8,000    -
OpenLLM Generic      bentoml/openllm              Varies   -

Integration Examples

Ready-to-use code snippets for your applications.

service.py
# service.py - define a BentoML service that wraps a vLLM engine
import bentoml

@bentoml.service
class LLMService:
    def __init__(self):
        # Import lazily so the heavy dependency loads only at serve time
        import vllm
        # Note: the Hugging Face repo id is "meta-llama/Meta-Llama-3-8B-Instruct"
        self.llm = vllm.LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

    @bentoml.api
    def generate(self, prompt: str) -> str:
        from vllm import SamplingParams
        params = SamplingParams(max_tokens=512)
        # vllm.LLM.generate takes a batch of prompts; take the first result
        output = self.llm.generate([prompt], params)
        return output[0].outputs[0].text

# Run: bentoml serve service:LLMService
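Once the service is running, any HTTP client can call it: by default, BentoML exposes each @bentoml.api method as a POST route named after the method, taking the method's parameters as a JSON body. A minimal sketch using requests (the prompt text is illustrative):

# client.py - call the running LLMService over HTTP
import requests

# generate() is exposed as POST /generate; parameters go in the JSON body
resp = requests.post(
    "http://localhost:3000/generate",
    json={"prompt": "Explain model serving in one sentence."},
)
resp.raise_for_status()
print(resp.text)  # the service returns the generated string as the body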

Free Tier Pricing & Limits

Rate Limit (requests per minute): Hardware dependent
Daily Quota (requests per day): Unlimited
Token Limit (tokens per minute): Unlimited
Monthly Quota: None (free, open source)

Use Cases

Standardizing ML deployment

Serving LLMs with OpenLLM

Hybrid cloud deployments

CI/CD for ML models

Running inference at scale

Deploying any model anywhere

Optimizing AI inference performance and cost

Managing and monitoring AI model inference

Interactive AI applications (chatbots, recommendations)

Asynchronous long-running AI tasks

Large-scale batch AI inference

Orchestrating complex AI workflows (RAG, compound AI systems)

Enterprise mission-critical AI deployments

Limitations & Considerations

Learning curve for the 'Bento' packaging concept

Production deployment requires cloud infrastructure knowledge

Local serving is only the first step toward production

Configuration overhead for complex services

