Update app.py
app.py
CHANGED
@@ -24,254 +24,31 @@ from exception import CustomExceptionHandling
# Download gguf model files
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models",exist_ok=True)
-
hf_hub_download(
    repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
    filename="madlad400-3b-mt-q8_0.gguf",
    local_dir="./models",
)


-
-
-    Roles.system: PromptMarkers("", "\n"), # System prompt should be included within user message
-    Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
-    Roles.assistant: PromptMarkers("<start_of_turn>model\n", "<end_of_turn>\n"),
-    Roles.tool: PromptMarkers("", ""), # If you need tool support
-}

-# Create the formatter
-gemma_3_formatter = MessagesFormatter(
-    pre_prompt="", # No pre-prompt
-    prompt_markers=gemma_3_prompt_markers,
-    include_sys_prompt_in_first_user_message=True, # Include system prompt in first user message
-    default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
-    strip_prompt=False, # Don't strip whitespace from the prompt
-    bos_token="<bos>", # Beginning of sequence token for Gemma 3
-    eos_token="<eos>", # End of sequence token for Gemma 3
-)

-
-# Set the title and description
-title = "Gemma Llama.cpp"
-description = """Gemma 3 is a family of lightweight, multimodal open models that offers advanced capabilities like large context windows and multilingual support, enabling diverse applications on various devices."""


-llm = None
-llm_model = None
-
import ctypes
import os
import multiprocessing

import llama_cpp

-def low_level():
-
-
-    llama_cpp.llama_backend_init(numa=False)
-
-    N_THREADS = multiprocessing.cpu_count()
-    MODEL_PATH = "models/madlad400-3b-mt-q8_0.gguf"
-
-    prompt = b"translate English to German: The house is wonderful."
-
-    lparams = llama_cpp.llama_model_default_params()
-    model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-
-    vocab = llama_cpp.llama_model_get_vocab(model)
-
-    cparams = llama_cpp.llama_context_default_params()
-    cparams.no_perf = False
-    ctx = llama_cpp.llama_init_from_model(model, cparams)
-
-    sparams = llama_cpp.llama_sampler_chain_default_params()
-    smpl = llama_cpp.llama_sampler_chain_init(sparams)
-    llama_cpp.llama_sampler_chain_add(smpl, llama_cpp.llama_sampler_init_greedy())
-
-    n_past = 0
-
-    embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-
-    n_of_tok = llama_cpp.llama_tokenize(
-        vocab,
-        prompt,
-        len(prompt),
-        embd_inp,
-        len(embd_inp),
-        True,
-        True,
-    )
-
-    embd_inp = embd_inp[:n_of_tok]
-
-    n_ctx = llama_cpp.llama_n_ctx(ctx)
-
-    n_predict = 20
-    n_predict = min(n_predict, n_ctx - len(embd_inp))
-
-    input_consumed = 0
-    input_noecho = False
-
-    remaining_tokens = n_predict
-
-    embd = []
-    last_n_size = 64
-    last_n_tokens_data = [0] * last_n_size
-    n_batch = 24
-    last_n_repeat = 64
-    repeat_penalty = 1
-    frequency_penalty = 0.0
-    presence_penalty = 0.0
-
-    batch = llama_cpp.llama_batch_init(n_batch, 0, 1)
-
-    # prepare batch for encoding containing the prompt
-    batch.n_tokens = len(embd_inp)
-    for i in range(batch.n_tokens):
-        batch.token[i] = embd_inp[i]
-        batch.pos[i] = i
-        batch.n_seq_id[i] = 1
-        batch.seq_id[i][0] = 0
-        batch.logits[i] = False
-
-    llama_cpp.llama_encode(
-        ctx,
-        batch
-    )
-
-    # now overwrite embd_inp so batch for decoding will initially contain only
-    # a single token with id acquired from llama_model_decoder_start_token(model)
-    embd_inp = [llama_cpp.llama_model_decoder_start_token(model)]
-
-    while remaining_tokens > 0:
-        if len(embd) > 0:
-
-            batch.n_tokens = len(embd)
-            for i in range(batch.n_tokens):
-                batch.token[i] = embd[i]
-                batch.pos[i] = n_past + i
-                batch.n_seq_id[i] = 1
-                batch.seq_id[i][0] = 0
-                batch.logits[i] = i == batch.n_tokens - 1
-
-            llama_cpp.llama_decode(
-                ctx,
-                batch
-            )
-
-        n_past += len(embd)
-        embd = []
-        if len(embd_inp) <= input_consumed:
-            id = llama_cpp.llama_sampler_sample(smpl, ctx, -1)
-
-            last_n_tokens_data = last_n_tokens_data[1:] + [id]
-            embd.append(id)
-            input_noecho = False
-            remaining_tokens -= 1
-        else:
-            while len(embd_inp) > input_consumed:
-                embd.append(embd_inp[input_consumed])
-                last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
-                input_consumed += 1
-                if len(embd) >= n_batch:
-                    break
-        if not input_noecho:
-            for id in embd:
-                size = 32
-                buffer = (ctypes.c_char * size)()
-                n = llama_cpp.llama_token_to_piece(
-                    vocab, llama_cpp.llama_token(id), buffer, size, 0, True
-                )
-                assert n <= size
-                print(
-                    buffer[:n].decode("utf-8"),
-                    end="",
-                    flush=True,
-                )
-
-        if len(embd) > 0 and embd[-1] in [llama_cpp.llama_token_eos(vocab), llama_cpp.llama_token_eot(vocab)]:
-            break
-
-    print()
-
-
-def trans(text):
-    #test()
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf")
-    tokens = llama.tokenize(b"translate English to German: The house is wonderful.")
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        print(llama.detokenize([token]))
-        if token == llama.token_eos():
-            break
-
-    return None
-
-    # Add the target-language tag to the text and convert it to bytes
-    input_text = f"<2ja>{text}"
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Get the BOS token and check it
-    bos_token = llm.token_bos()
-    print("BOS Token:", bos_token)
-    initial_tokens = [bos_token]
-    initial_tokens = [1]
-    print("Initial Tokens:", initial_tokens)
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-    # Add the target-language tag to the text and convert it to bytes
-    input_text = f"<2ja>{text}".encode('utf-8')
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Use the BOS token (assuming a decoder-only model)
-    initial_tokens = [llm.token_bos()]
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-
-    input_text = f"<2ja>{text}".encode('utf-8')
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-    initial_tokens = [llm.decoder_start_token()]
-    print("Initial Tokens:", initial_tokens)
-    return text
-    llama = llm
-    text = f"<2ja>{text}".encode()
-    tokens = llama.tokenize(text)
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    buf = ""
-    for token in llama.generate(tokens, top_k=0, top_p=0.95, temp=0, repeat_penalty=1.0):
-        buf += llama.detokenize([token]).decode()
-        if token == llama.token_eos():
-            break
-    return buf
-
def respond(
    message: str,
    history: List[Tuple[str, str]],
@@ -283,24 +60,6 @@ def respond(
    top_k: int,
    repeat_penalty: float,
):
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
-        n_gpu_layers=0,
-        n_batch=16,
-        n_ctx=512,
-        n_threads=2,
-        n_threads_batch=8,)
-    #tokens = llama.tokenize(f"<2ja>{message}")#
-    tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    outputs =""
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        outputs+= llama.detokenize([token]).decode()
-        yield outputs
-        if token == llama.token_eos():
-            break
-    return outputs
-
    """
    Respond to a message using the Gemma3 model via Llama.cpp.

@@ -319,79 +78,35 @@ def respond(
        str: The response to the message.
    """
    try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        #yield "done"
-
-        provider = LlamaCppPythonProvider(llm)
-
-        # Create the agent
-        agent = LlamaCppAgent(
-            provider,
-            system_prompt=f"{system_message}",
-            # predefined_messages_formatter_type=GEMMA_2,
-            custom_messages_formatter=gemma_3_formatter,
-            debug_output=True,
-        )
-
-        # Set the settings like temperature, top-k, top-p, max tokens, etc.
-        settings = provider.get_provider_default_settings()
-        settings.temperature = temperature
-        settings.top_k = top_k
-        settings.top_p = top_p
-        settings.max_tokens = max_tokens
-        settings.repeat_penalty = repeat_penalty
-        settings.stream = True
-
-        messages = BasicChatHistory()
-
-        # Add the chat history
-        for msn in history:
-            user = {"role": Roles.user, "content": msn[0]}
-            assistant = {"role": Roles.assistant, "content": msn[1]}
-            messages.add_message(user)
-            messages.add_message(assistant)
-
-        # Get the response stream
-        stream = agent.get_chat_response(
-            message,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False,
-        )
-
-        # Log the success
-        logging.info("Response stream generated successfully")
-
-        # Generate the response
-        outputs = ""
-        for output in stream:
-            outputs += output
-            #yield outputs
-
-    # Handle exceptions that may occur during the process
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e


# Create a chat interface
demo = gr.ChatInterface(
@@ -413,7 +128,7 @@ demo = gr.ChatInterface(
            value="You are a helpful assistant.",
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
-            lines=2,
        ),
        gr.Slider(
            minimum=512,

| 24 |   # Download gguf model files
| 25 |   huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
| 26 |   os.makedirs("models",exist_ok=True)
| 27 | +
| 28 |   hf_hub_download(
| 29 |       repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
| 30 |       filename="madlad400-3b-mt-q8_0.gguf",
| 31 |       local_dir="./models",
| 32 |   )
| 33 |
| 34 | + # Set the title and description
| 35 | + title = "madlad400-3b-mt Llama.cpp"
| 36 | + description = """
| 37 | + I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5). I'm not sure the current llama-cpp-python supports T5.
| 38 |
| 39 | + [Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF) [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp) [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
| 40 | + """
| 41 |
| 42 |
| 43 | + llama = None
| 44 |
| 45 |
| 46 |   import ctypes
| 47 |   import os
| 48 |   import multiprocessing
| 49 |
| 50 |   import llama_cpp
| 51 |
| 52 |   def respond(
| 53 |       message: str,
| 54 |       history: List[Tuple[str, str]],
| 60 |       top_k: int,
| 61 |       repeat_penalty: float,
| 62 |   ):
| 63 |       """
| 64 |       Respond to a message using the Gemma3 model via Llama.cpp.
| 65 |
|  78 |           str: The response to the message.
|  79 |       """
|  80 |       try:
|  81 | +         global llama
|  82 | +         if llama == None:
|  83 | +             llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
|  84 | +                 n_gpu_layers=0,
|  85 | +                 n_batch=32,
|  86 | +                 n_ctx=512,
|  87 | +                 n_threads=2,
|  88 | +                 n_threads_batch=16)
|  89 | +
|  90 | +         tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
|  91 | +         llama.encode(tokens)
|  92 | +         tokens = [llama.decoder_start_token()]
|  93 | +         outputs =""
|  94 | +         for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
|  95 | +             outputs+= llama.detokenize([token]).decode()
|  96 | +             yield outputs
|  97 | +             if token == llama.token_eos():
|  98 | +                 break
|  99 | +         return outputs
| 100 |       except Exception as e:
| 101 |           # Custom exception handling
| 102 |           raise CustomExceptionHandling(e, sys) from e
| 103 |
| 104 | +     return None
| 105 | +
| 106 | +
| 107 | +
| 108 | +
| 109 | +
| 110 |
| 111 |   # Create a chat interface
| 112 |   demo = gr.ChatInterface(
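For reference, here is a minimal standalone sketch of the encoder-decoder loop that the new respond() relies on. It assumes the fairydreaming t5 branch of llama-cpp-python (where Llama.encode() and Llama.decoder_start_token() are available, as the description above notes); the translate() helper and its lang_tag argument are illustrative names, not part of this commit.

    # Minimal sketch, assuming the t5 branch of llama-cpp-python.
    from llama_cpp import Llama

    llama = Llama("models/madlad400-3b-mt-q8_0.gguf", n_ctx=512)

    def translate(text: str, lang_tag: str = "<2ja>") -> str:
        # MADLAD-400 selects the target language with a <2xx> prefix tag (<2ja>, <2de>, ...).
        tokens = llama.tokenize(f"{lang_tag}{text}".encode("utf-8"))
        llama.encode(tokens)                    # run the encoder over the tagged source text
        tokens = [llama.decoder_start_token()]  # decoding starts from the decoder start token
        out = ""
        for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=0.0, repeat_penalty=1.0):
            if token == llama.token_eos():
                break
            out += llama.detokenize([token]).decode("utf-8", errors="ignore")
        return out

    print(translate("The house is wonderful."))

Streaming in respond() works the same way, except each detokenized piece is appended to outputs and yielded so the chat interface can render partial translations.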
| 128 |               value="You are a helpful assistant.",
| 129 |               label="System Prompt",
| 130 |               info="Define the AI assistant's personality and behavior",
| 131 | +             lines=2,visible=False
| 132 |           ),
| 133 |           gr.Slider(
| 134 |               minimum=512,
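A hedged sketch of how a streaming generator like respond() is typically wired into the Gradio interface these hunks configure. The exact additional_inputs in app.py are only partially visible in this diff, so the component list below (system-prompt textbox plus sampling sliders, with assumed ranges and defaults) is an assumption rather than the committed code.

    import gradio as gr

    # Assumed wiring; gr.ChatInterface streams whatever the respond() generator yields.
    demo = gr.ChatInterface(
        respond,
        title=title,
        description=description,
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful assistant.",
                label="System Prompt",
                info="Define the AI assistant's personality and behavior",
                lines=2,
                visible=False,  # hidden, as in this commit
            ),
            gr.Slider(minimum=512, maximum=2048, value=1024, step=1, label="Max Tokens"),      # assumed range
            gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.1, label="Temperature"),     # assumed
            gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p"),         # assumed
            gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),                # assumed
            gr.Slider(minimum=1.0, maximum=2.0, value=1.0, step=0.1, label="Repeat Penalty"),  # assumed
        ],
    )

    if __name__ == "__main__":
        demo.launch()

Because respond() yields progressively longer strings, the chat panel updates as tokens are generated; the slider values are passed straight through to llama.generate() as the sampling parameters shown in the new hunk above.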