# app.py (GGUF + llama-cpp-python version)

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import uvicorn
import json

# 1. Create the FastAPI app instance
app = FastAPI()

# 2. Prepare the GGUF model for loading.
#    TheBloke's SOLAR model is used as the example.
#    'repo_id' is the repository hosting the model; 'filename' is the specific GGUF file inside it.
model_repo_id = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF"
model_filename = "solar-10.7b-instruct-v1.0.Q4_K_S.gguf"

#    To test with the 7B Qwen 2.5 model instead, swap in:
# model_repo_id = "Triangle104/Qwen2.5-7B-Instruct-Q4_K_S-GGUF"
# model_filename = "qwen2.5-7b-instruct-q4_k_s.gguf"

# Download the GGUF file from the Hugging Face Hub and get its local path.
# This runs only once, at server startup (subsequent runs hit the local cache).
model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)

# Load the GGUF model into memory with llama-cpp-python.
# n_gpu_layers=-1 offloads as many layers as possible to the GPU; use 0 in CPU-only environments.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,      # maximum number of tokens the model can process at once
    n_threads=8,     # number of CPU threads to use
    n_gpu_layers=0   # number of layers to offload to the GPU (-1 = as many as possible)
)
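
# Note: llama-cpp-python also exposes a chat-style API (create_chat_completion)
# that applies the model's own chat template instead of a hand-built prompt
# string. A minimal sketch (the message content here is illustrative only):
#
#   chat_output = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Translate '์•ˆ๋…•ํ•˜์„ธ์š”' into English."}],
#       max_tokens=128,
#   )
#   print(chat_output["choices"][0]["message"]["content"])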

# The request body schema is the same as before
class TranslationRequest(BaseModel):
    text: str

# 3. The modified API endpoint
@app.post("/translate")
async def translate_all_in_one(request: TranslationRequest):
    korean_text = request.text

    # GGUF ๋ชจ๋ธ(Llama-2 Chat ํ˜•์‹)์— ๋งž๋Š” ํ”„๋กฌํ”„ํŠธ ํ˜•์‹
    prompt = f"""### User:
        You are a helpful translation and pronunciation assistant.
        Given the following Korean text, perform three tasks.
        1. Translate the text into natural, everyday English.
        2. Translate the text into natural, everyday Japanese.
        3. Provide the Korean pronunciation (Hangul) for the generated Japanese translation.

        Format your response as a single, valid JSON object with the keys "english", "japanese", and "pronunciation".

        Korean Text: "{korean_text}"

        ### Assistant:
        """
    
    # ๋ชจ๋ธ์„ ํ†ตํ•ด ํ…์ŠคํŠธ ์ƒ์„ฑ ์‹คํ–‰
    output = llm(
      prompt,
      max_tokens=512,
      stop=["### User:", "</s>"], # ์‘๋‹ต ์ƒ์„ฑ์„ ๋ฉˆ์ถœ ํŠน์ • ๋‹จ์–ด
      temperature=0.7,
      top_k=50,
      echo=False # ํ”„๋กฌํ”„ํŠธ๋ฅผ ๋‹ค์‹œ ์ถœ๋ ฅํ•˜์ง€ ์•Š๋„๋ก ์„ค์ •
    )

    generated_output = output["choices"][0]["text"].strip()
    
    try:
        # GGUF instruct models tend to emit clean JSON, but parsing can still fail.
        # Only JSONDecodeError is caught here: the indexing into output["choices"]
        # happens above, outside this try block, so an IndexError cannot occur here.
        parsed_json = json.loads(generated_output)
        return parsed_json
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print(f"Raw model output: {generated_output}")
        return {"error": "Failed to parse model output as JSON", "raw_output": generated_output}

@app.get("/")
def read_root():
    return {"message": "GGUF Translation API is running"}