local-ai: 2.12.4 -> 2.13.0

Christian Kögler 2024-04-26 19:40:55 +02:00
parent dfd38d2388
commit 729264e1d1
4 changed files with 311 additions and 129 deletions

View File

@@ -0,0 +1,30 @@
{ lib
, writers
, writeText
, linkFarmFromDrvs
}: {
genModels = configs:
let
name = lib.strings.sanitizeDerivationName
(builtins.concatStringsSep "_" ([ "local-ai-models" ] ++ (builtins.attrNames configs)));
genModelFiles = name: config:
let
templateName = type: name + "_" + type;
config' = lib.recursiveUpdate config ({
inherit name;
} // lib.optionalAttrs (lib.isDerivation config.parameters.model) {
parameters.model = config.parameters.model.name;
} // lib.optionalAttrs (config ? template) {
template = builtins.mapAttrs (n: _: templateName n) config.template;
});
in
[ (writers.writeYAML "${name}.yaml" config') ]
++ lib.optional (lib.isDerivation config.parameters.model)
config.parameters.model
++ lib.optionals (config ? template)
(lib.mapAttrsToList (n: writeText "${templateName n}.tmpl") config.template);
in
linkFarmFromDrvs name (lib.flatten (lib.mapAttrsToList genModelFiles configs));
}
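
The new lib.nix provides genModels, which turns an attribute set of model configurations into a link farm of YAML config files, model blobs and prompt templates; it is re-exported by the package as passthru.lib (see package.nix below) and consumed by the tests. A minimal usage sketch, not part of this commit, reusing the URL and hash from the bert test further down:

# Sketch only: calling the new helper via the package's passthru.lib.
# The fetchurl details are copied from the bert test in tests.nix.
{ pkgs }:
pkgs.local-ai.lib.genModels {
  embedding = {
    backend = "bert-embeddings";
    embeddings = true;
    parameters.model = pkgs.fetchurl {
      url = "https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-f16.bin";
      sha256 = "9c195b2453a4fef60a4f6be3a88a39211366214df6498a4fe4885c9e22314f50";
    };
  };
}

The result is a linkFarmFromDrvs derivation (here named local-ai-models_embedding) containing embedding.yaml with parameters.model rewritten to the model file's name, the model file itself, and one <name>_<type>.tmpl file per template entry.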

View File

@@ -0,0 +1,56 @@
{ pkgs, config, lib, ... }:
let
cfg = config.services.local-ai;
inherit (lib) mkOption types;
in
{
options.services.local-ai = {
enable = lib.mkEnableOption "local-ai";
package = lib.mkPackageOption pkgs "local-ai" { };
extraArgs = mkOption {
type = types.listOf types.str;
default = [ ];
};
port = mkOption {
type = types.port;
default = 8080;
};
threads = mkOption {
type = types.int;
default = 1;
};
models = mkOption {
type = types.either types.package types.str;
default = "models";
};
};
config = lib.mkIf cfg.enable {
systemd.services.local-ai = {
wantedBy = [ "multi-user.target" ];
serviceConfig = {
DynamicUser = true;
ExecStart = lib.escapeShellArgs ([
"${cfg.package}/bin/local-ai"
"--debug"
"--address"
":${toString cfg.port}"
"--threads"
(toString cfg.threads)
"--localai-config-dir"
"."
"--models-path"
(toString cfg.models)
]
++ cfg.extraArgs);
RuntimeDirectory = "local-ai";
WorkingDirectory = "%t/local-ai";
};
};
};
}
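
A minimal sketch of enabling the service through this module from a NixOS configuration; the option names are those defined above, and the import path is hypothetical since the module ships alongside the package rather than under nixos/modules:

# Sketch only: wiring the module above into a configuration.
{ pkgs, ... }:
{
  imports = [ ./module.nix ];  # hypothetical location of the file above
  services.local-ai = {
    enable = true;
    port = 8080;      # default
    threads = 4;
    # either a directory name relative to the runtime directory or a package,
    # e.g. a link farm produced by genModels from lib.nix
    models = "models";
  };
}

The unit runs local-ai as a DynamicUser with %t/local-ai as its working directory, so a plain string for models resolves relative to that runtime directory.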

View File

@@ -6,6 +6,8 @@
, fetchpatch
, fetchFromGitHub
, protobuf
, protoc-gen-go
, protoc-gen-go-grpc
, grpc
, openssl
, llama-cpp
@@ -61,8 +63,8 @@ let
inherit (cudaPackages) libcublas cuda_nvcc cuda_cccl cuda_cudart cudatoolkit;
go-llama-ggml = effectiveStdenv.mkDerivation {
name = "go-llama-ggml";
go-llama = effectiveStdenv.mkDerivation {
name = "go-llama";
src = fetchFromGitHub {
owner = "go-skynet";
repo = "go-llama.cpp";
@@ -98,8 +100,8 @@ let
src = fetchFromGitHub {
owner = "ggerganov";
repo = "llama.cpp";
rev = "1b67731e184e27a465b8c5476061294a4af668ea";
hash = "sha256-0WWbsklpW6HhFRkvWpYh8Lhi8VIansS/zmyIKNQRkIs=";
rev = "784e11dea1f5ce9638851b2b0dddb107e2a609c8";
hash = "sha256-yAQAUo5J+a6O2kTqhFL1UH0tANxpQn3JhAd3MByaC6I=";
fetchSubmodules = true;
};
postPatch = prev.postPatch + ''
@@ -252,8 +254,8 @@ let
src = fetchFromGitHub {
owner = "ggerganov";
repo = "whisper.cpp";
rev = "8f253ef3af1c62c04316ba4afa7145fc4d701a8c";
hash = "sha256-yHHjhpQIn99A/hqFwAb7TfTf4Q9KnKat93zyXS70bT8=";
rev = "858452d58dba3acdc3431c9bced2bb8cfd9bf418";
hash = "sha256-2fT3RgGpBex1mF6GJsVDo4rb0F31YqxTymsXcrpQAZk=";
};
nativeBuildInputs = [ cmake pkg-config ]
@@ -371,18 +373,18 @@ let
stdenv;
pname = "local-ai";
version = "2.12.4";
version = "2.13.0";
src = fetchFromGitHub {
owner = "go-skynet";
repo = "LocalAI";
rev = "v${version}";
hash = "sha256-piu2B6u4ZfxiOd9SXrE7jiiiwL2SM8EqXo2s5qeKRl0=";
hash = "sha256-jZE8Ow9FFhnx/jvsURLYlYtSuKpE4UWBezxg/mpHs9g=";
};
self = buildGoModule.override { stdenv = effectiveStdenv; } {
inherit pname version src;
vendorHash = "sha256-8Hu1y/PK21twnB7D22ltslFFzRrsB8d1R2hkgIFB/XY=";
vendorHash = "sha256-nWNK2YekQnBSLx4ouNSe6esIe0yFuo69E0HStYLQANg=";
env.NIX_CFLAGS_COMPILE = lib.optionalString with_stablediffusion " -isystem ${opencv}/include/opencv4";
@@ -392,12 +394,12 @@ let
in
''
sed -i Makefile \
-e 's;git clone.*go-llama-ggml$;${cp} ${go-llama-ggml} sources/go-llama-ggml;' \
-e 's;git clone.*go-llama\.cpp$;${cp} ${go-llama} sources/go-llama\.cpp;' \
-e 's;git clone.*gpt4all$;${cp} ${gpt4all} sources/gpt4all;' \
-e 's;git clone.*go-piper$;${cp} ${if with_tts then go-piper else go-piper.src} sources/go-piper;' \
-e 's;git clone.*go-rwkv$;${cp} ${go-rwkv} sources/go-rwkv;' \
-e 's;git clone.*go-rwkv\.cpp$;${cp} ${go-rwkv} sources/go-rwkv\.cpp;' \
-e 's;git clone.*whisper\.cpp$;${cp} ${whisper-cpp.src} sources/whisper\.cpp;' \
-e 's;git clone.*go-bert$;${cp} ${go-bert} sources/go-bert;' \
-e 's;git clone.*go-bert\.cpp$;${cp} ${go-bert} sources/go-bert\.cpp;' \
-e 's;git clone.*diffusion$;${cp} ${if with_stablediffusion then go-stable-diffusion else go-stable-diffusion.src} sources/go-stable-diffusion;' \
-e 's;git clone.*go-tiny-dream$;${cp} ${if with_tinydream then go-tiny-dream else go-tiny-dream.src} sources/go-tiny-dream;' \
-e 's, && git checkout.*,,g' \
@@ -415,14 +417,19 @@ let
++ lib.optionals with_stablediffusion go-stable-diffusion.buildInputs
++ lib.optionals with_tts go-piper.buildInputs;
nativeBuildInputs = [ makeWrapper ]
++ lib.optionals with_cublas [ cuda_nvcc ];
nativeBuildInputs = [
protobuf
protoc-gen-go
protoc-gen-go-grpc
makeWrapper
]
++ lib.optionals with_cublas [ cuda_nvcc ];
enableParallelBuilding = false;
modBuildPhase = ''
mkdir sources
make prepare-sources
make prepare-sources protogen-go
go mod tidy -v
'';
@@ -486,7 +493,7 @@ let
passthru.local-packages = {
inherit
go-tiny-dream go-rwkv go-bert go-llama-ggml gpt4all go-piper
go-tiny-dream go-rwkv go-bert go-llama gpt4all go-piper
llama-cpp-grpc whisper-cpp go-tiny-dream-ncnn espeak-ng' piper-phonemize
piper-tts';
};
@@ -498,6 +505,7 @@ let
};
passthru.tests = callPackages ./tests.nix { inherit self; };
passthru.lib = callPackages ./lib.nix { };
meta = with lib; {
description = "OpenAI alternative to run local LLMs, image and audio generation";

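Besides the source, vendor and submodule hash bumps, 2.13.0 pulls protoc-gen-go and protoc-gen-go-grpc into nativeBuildInputs and runs make prepare-sources protogen-go in modBuildPhase, apparently because the gRPC bindings are now regenerated at build time. The tests below switch on passthru.features; a sketch of selecting features, assuming the with_* flags are arguments of the package function (only their use through passthru.features is visible in these hunks):

# Sketch only: the with_* flags are assumed to be overridable arguments.
{ pkgs }:
pkgs.local-ai.override {
  with_cublas = true;  # CUDA code paths referenced above
  with_tts = true;     # piper/espeak-ng based text-to-speech backend
}
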
View File

@@ -5,156 +5,244 @@
, fetchurl
, writers
, symlinkJoin
, linkFarmFromDrvs
, jq
}:
let
common-config = { config, ... }: {
imports = [ ./module.nix ];
services.local-ai = {
enable = true;
package = self;
threads = config.virtualisation.cores;
};
};
inherit (self.lib) genModels;
in
{
version = testers.testVersion {
package = self;
version = "v" + self.version;
command = "local-ai --help";
};
health =
let
port = "8080";
in
testers.runNixOSTest {
name = self.name + "-health";
nodes.machine = {
systemd.services.local-ai = {
wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = "${self}/bin/local-ai --debug --localai-config-dir . --address :${port}";
};
};
testScript = ''
health = testers.runNixOSTest ({ config, ... }: {
name = self.name + "-health";
nodes.machine = common-config;
testScript =
let
port = "8080";
in
''
machine.wait_for_open_port(${port})
machine.succeed("curl -f http://localhost:${port}/readyz")
'';
};
});
# https://localai.io/docs/getting-started/manual/
llama =
# https://localai.io/features/embeddings/#bert-embeddings
bert =
let
port = "8080";
gguf = fetchurl {
url = "https://huggingface.co/TheBloke/Luna-AI-Llama2-Uncensored-GGUF/resolve/main/luna-ai-llama2-uncensored.Q4_K_M.gguf";
sha256 = "6a9dc401c84f0d48996eaa405174999c3a33bf12c2bfd8ea4a1e98f376de1f15";
model = "embedding";
model-configs.${model} = {
# Note: q4_0 and q4_1 models cannot be loaded
parameters.model = fetchurl {
url = "https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-f16.bin";
sha256 = "9c195b2453a4fef60a4f6be3a88a39211366214df6498a4fe4885c9e22314f50";
};
backend = "bert-embeddings";
embeddings = true;
};
models = genModels model-configs;
requests.request = {
inherit model;
input = "Your text string goes here";
};
models = linkFarmFromDrvs "models" [
gguf
];
in
testers.runNixOSTest {
name = self.name + "-llama";
nodes.machine =
let
cores = 4;
in
{
virtualisation = {
inherit cores;
memorySize = 8192;
};
systemd.services.local-ai = {
wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = "${self}/bin/local-ai --debug --threads ${toString cores} --models-path ${models} --localai-config-dir . --address :${port}";
};
};
name = self.name + "-bert";
nodes.machine = {
imports = [ common-config ];
virtualisation.cores = 2;
virtualisation.memorySize = 2048;
services.local-ai.models = models;
};
passthru = { inherit models requests; };
testScript =
let
# https://localai.io/features/text-generation/#chat-completions
request-chat-completions = {
model = gguf.name;
messages = [{ role = "user"; content = "Say this is a test!"; }];
temperature = 0.7;
};
# https://localai.io/features/text-generation/#edit-completions
request-edit-completions = {
model = gguf.name;
instruction = "rephrase";
input = "Black cat jumped out of the window";
temperature = 0.7;
};
# https://localai.io/features/text-generation/#completions
request-completions = {
model = gguf.name;
prompt = "A long time ago in a galaxy far, far away";
temperature = 0.7;
};
port = "8080";
in
''
machine.wait_for_open_port(${port})
machine.succeed("curl -f http://localhost:${port}/readyz")
machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${gguf.name}\"' models.json")
machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" request-chat-completions} --output chat-completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"chat.completion\"' chat-completions.json")
machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" request-edit-completions} --output edit-completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"edit\"' edit-completions.json")
machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" request-completions} --output completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .object ==\"text_completion\"' completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
machine.succeed("curl -f http://localhost:${port}/embeddings --json @${writers.writeJSON "request.json" requests.request} --output embeddings.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .model == \"${model}\"' embeddings.json")
'';
};
} // lib.optionalAttrs self.features.with_tts {
# https://localai.io/features/text-to-audio/#piper
tts =
} // lib.optionalAttrs (!self.features.with_cublas && !self.features.with_clblas) {
# https://localai.io/docs/getting-started/manual/
llama =
let
port = "8080";
voice-en-us = fetchzip {
url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
stripRoot = false;
model = "gpt-3.5-turbo";
# https://localai.io/advanced/#full-config-model-file-reference
model-configs.${model} = rec {
context_size = 8192;
parameters = {
# https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF
# https://ai.meta.com/blog/meta-llama-3/
model = fetchurl {
url = "https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf";
sha256 = "ab9e4eec7e80892fd78f74d9a15d0299f1e22121cea44efd68a7a02a3fe9a1da";
};
# defaults from:
# https://deepinfra.com/meta-llama/Meta-Llama-3-8B-Instruct
temperature = 0.7;
top_p = 0.9;
top_k = 0;
# the following parameter leads to outputs like: !!!!!!!!!!!!!!!!!!!
#repeat_penalty = 1;
presence_penalty = 0;
frequency_penalty = 0;
max_tokens = 100;
};
stopwords = [ "<|eot_id|>" ];
template = {
# Templates implement the following specifications:
# https://github.com/meta-llama/llama3/tree/main?tab=readme-ov-file#instruction-tuned-models
# ... and are inspired by:
# https://github.com/mudler/LocalAI/blob/master/embedded/models/llama3-instruct.yaml
#
# The rules for template evaluation are defined here:
# https://pkg.go.dev/text/template
chat_message = ''
<|start_header_id|>{{.RoleName}}<|end_header_id|>
{{.Content}}${builtins.head stopwords}'';
chat = "<|begin_of_text|>{{.Input}}<|start_header_id|>assistant<|end_header_id|>";
};
};
ggml-tiny-en = fetchurl {
url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin";
hash = "sha256-x3xXZvHO8JtrfUfyG1Rsvd1BV4hrO11tT3CekeZsfCs=";
};
whisper-en = {
name = "whisper-en";
backend = "whisper";
parameters.model = ggml-tiny-en.name;
};
models = symlinkJoin {
name = "models";
paths = [
voice-en-us
(linkFarmFromDrvs "whisper-en" [
(writers.writeYAML "whisper-en.yaml" whisper-en)
ggml-tiny-en
])
];
models = genModels model-configs;
requests = {
# https://localai.io/features/text-generation/#chat-completions
chat-completions = {
inherit model;
messages = [{ role = "user"; content = "1 + 2 = ?"; }];
};
# https://localai.io/features/text-generation/#edit-completions
edit-completions = {
inherit model;
instruction = "rephrase";
input = "Black cat jumped out of the window";
max_tokens = 50;
};
# https://localai.io/features/text-generation/#completions
completions = {
inherit model;
prompt = "A long time ago in a galaxy far, far away";
};
};
in
testers.runNixOSTest {
name = self.name + "-tts";
nodes.machine =
let
cores = 2;
in
{
virtualisation = {
inherit cores;
};
systemd.services.local-ai = {
wantedBy = [ "multi-user.target" ];
serviceConfig.ExecStart = "${self}/bin/local-ai --debug --threads ${toString cores} --models-path ${models} --localai-config-dir . --address :${port}";
};
};
name = self.name + "-llama";
nodes.machine = {
imports = [ common-config ];
virtualisation.cores = 4;
virtualisation.memorySize = 8192;
services.local-ai.models = models;
};
passthru = { inherit models requests; };
testScript =
let
request = {
model = "en-us-danny-low.onnx";
backend = "piper";
input = "Hello, how are you?";
};
port = "8080";
in
''
machine.wait_for_open_port(${port})
machine.succeed("curl -f http://localhost:${port}/readyz")
machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" request} --output out.wav")
machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${whisper-en.name} --output transcription.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${request.input}\"' transcription.json")
machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" requests.chat-completions} --output chat-completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"chat.completion\"' chat-completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .choices | first.message.content | tonumber == 3' chat-completions.json")
machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" requests.edit-completions} --output edit-completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"edit\"' edit-completions.json")
machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString requests.edit-completions.max_tokens}' edit-completions.json")
machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" requests.completions} --output completions.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .object ==\"text_completion\"' completions.json")
machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString model-configs.${model}.parameters.max_tokens}' completions.json")
'';
};
} // lib.optionalAttrs (self.features.with_tts && !self.features.with_cublas && !self.features.with_clblas) {
# https://localai.io/features/text-to-audio/#piper
tts =
let
model-stt = "whisper-en";
model-configs.${model-stt} = {
backend = "whisper";
parameters.model = fetchurl {
url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin";
hash = "sha256-x3xXZvHO8JtrfUfyG1Rsvd1BV4hrO11tT3CekeZsfCs=";
};
};
model-tts = "piper-en";
model-configs.${model-tts} = {
backend = "piper";
parameters.model = "en-us-danny-low.onnx";
};
models =
let
models = genModels model-configs;
in
symlinkJoin {
inherit (models) name;
paths = [
models
(fetchzip {
url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
stripRoot = false;
})
];
};
requests.request = {
model = model-tts;
input = "Hello, how are you?";
};
in
testers.runNixOSTest {
name = self.name + "-tts";
nodes.machine = {
imports = [ common-config ];
virtualisation.cores = 2;
services.local-ai.models = models;
};
passthru = { inherit models requests; };
testScript =
let
port = "8080";
in
''
machine.wait_for_open_port(${port})
machine.succeed("curl -f http://localhost:${port}/readyz")
machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug' models.json")
machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" requests.request} --output out.wav")
machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${model-stt} --output transcription.json")
machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${requests.request.input}\"' transcription.json")
'';
};
}
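
These tests are attached to the package via passthru.tests = callPackages ./tests.nix { inherit self; } (see package.nix above), so each one builds as an ordinary derivation. version, health and bert are always present; llama is skipped for cublas/clblas builds and tts additionally requires with_tts. A sketch of consuming them from another expression:

# Sketch only: building selected VM tests defined above.
{ pkgs }:
{
  local-ai-health = pkgs.local-ai.tests.health;
  local-ai-bert   = pkgs.local-ai.tests.bert;
  # only present on builds without cublas/clblas:
  local-ai-llama  = pkgs.local-ai.tests.llama or null;
}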