koh1018's picture
Add llama-cpp-python wheel via Git LFS
dd11498
raw
history blame
3.13 kB
# app.py (GGUF + llama-cpp-python version)
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import uvicorn
import json
# 1. Create the FastAPI application instance that the route decorators below attach to.
app = FastAPI()
# 2. Prepare to load the GGUF model.
# TheBloke's SOLAR model is used here as an example.
# 'repo_id' is the repository that hosts the model; 'filename' is the specific GGUF file inside it.
model_repo_id = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF"
model_filename = "solar-10.7b-instruct-v1.0.Q4_K_S.gguf"
# Alternative kept for testing: switch to Qwen 2.5, a 7B model.
# model_repo_id = "Triangle104/Qwen2.5-7B-Instruct-Q4_K_S-GGUF"
# model_filename = "qwen2.5-7b-instruct-q4_k_s.gguf"
# Download the GGUF file from the Hugging Face Hub and get its local path.
# This is a module-level statement, so it runs once when the module is imported (server start).
model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
# Load the GGUF model into memory with llama-cpp-python.
# n_gpu_layers=-1 means "use the GPU as much as possible"; set it to 0 in CPU-only environments.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,  # maximum token length the model can process at once
    n_threads=8,  # number of CPU threads to use
    n_gpu_layers=0  # number of layers to offload to the GPU (-1 means offload as many as possible)
)
# The request body format is unchanged from the previous version.
class TranslationRequest(BaseModel):
    """Request body accepted by the POST /translate endpoint."""

    # Source text to translate; the endpoint inserts it into the prompt as the "Korean Text".
    text: str
# 3. API endpoint
def _extract_json_object(raw_text):
    """Parse *raw_text* into a JSON object (``dict``).

    The text is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, so a reply that wraps the JSON
    in a Markdown code fence or surrounds it with prose is still accepted.

    Args:
        raw_text: Text produced by the model.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If nothing could be parsed (``json.JSONDecodeError`` is a
            subclass of ``ValueError``) or the parsed value is not an object.
    """
    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        start = raw_text.find("{")
        end = raw_text.rfind("}")
        if start == -1 or end < start:
            # No brace-delimited span to retry with; surface the original error.
            raise
        parsed = json.loads(raw_text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


@app.post("/translate")
async def translate_all_in_one(request: TranslationRequest):
    """Translate Korean text to English and Japanese in one model call.

    Builds a single prompt asking the model for an English translation, a
    Japanese translation, and the Hangul pronunciation of the Japanese text,
    then parses the model's reply as JSON.

    Args:
        request: Request body whose ``text`` field is the Korean source text.

    Returns:
        The JSON object produced by the model. The prompt asks for the keys
        "english", "japanese" and "pronunciation", but they are not validated
        here. If the reply cannot be parsed as a JSON object, a dict with the
        keys "error" and "raw_output" is returned instead.
    """
    korean_text = request.text
    # Prompt written in the "### User:" / "### Assistant:" turn layout.
    # NOTE(review): presumably the template the SOLAR instruct model expects -
    # confirm against the model card. The request text is interpolated verbatim.
    prompt = f"""### User:
You are a helpful translation and pronunciation assistant.
Given the following Korean text, perform three tasks.
1. Translate the text into natural, everyday English.
2. Translate the text into natural, everyday Japanese.
3. Provide the Korean pronunciation (Hangul) for the generated Japanese translation.
Format your response as a single, valid JSON object with the keys "english", "japanese", and "pronunciation".
Korean Text: "{korean_text}"
### Assistant:
"""
    # Run text generation through the model.
    # NOTE(review): this call is not awaited, so it presumably blocks the event
    # loop for the whole generation - confirm this is acceptable for the deployment.
    output = llm(
        prompt,
        max_tokens=512,
        stop=["### User:", "</s>"],  # sequences that end the generated reply
        temperature=0.7,
        top_k=50,
        echo=False,  # do not repeat the prompt in the output
    )
    # A response without any completion is treated as empty text so that it
    # takes the same error path as unparseable output instead of raising.
    try:
        generated_output = output["choices"][0]["text"].strip()
    except (KeyError, IndexError):
        generated_output = ""
    try:
        return _extract_json_object(generated_output)
    except ValueError as e:
        print(f"JSON 파싱 에러: {e}")
        print(f"모델 원본 출력: {generated_output}")
        return {"error": "Failed to parse model output as JSON", "raw_output": generated_output}
@app.get("/")
def read_root():
    """Answer GET / with a fixed message showing that the API is running."""
    status = {"message": "GGUF Translation API is running"}
    return status