Spaces:

everydayconversify
/

conversify-model

Sleeping

App Files Community

koh1018 committed on Jul 5

Commit

43bf256

1 Parent(s): effaff8

모델 양자화 및 교체, GGUF 사용으로 코드 전체 교체

Browse files

Files changed (2) hide show

app.py +57 -79
requirements.txt +3 -3

app.py CHANGED Viewed

@@ -1,101 +1,79 @@
-import torch
 from fastapi import FastAPI
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import uvicorn
 # 1. FastAPI 앱 인스턴스 생성
 app = FastAPI()
-# 2. AI 모델과 토크나이저를 앱 시작 시 한번만 로딩 (매우 중요)
-MODEL_ID = "Qwen/Qwen1.5-1.8B-Chat"
-# 메모리 효율을 위해 bfloat16 사용하고, accelerate 라이브러리로 하드웨어 자동 할당
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-)
-# Transformers 라이브러리의 pipeline을 사용하면 코드가 더 간결해짐
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
 )
-# 3. 요청 본문(Request Body)의 데이터 형식을 지정
-#    프론트엔드에서 "text"라는 키에 번역할 문장을 담아서 보내야 한다는 규칙.
 class TranslationRequest(BaseModel):
-    korean_text: str
-    target_lang: str # 'en' 또는 'ja'
-# 발음 변환 API를 위한 주문서 양식
-class PronunciationRequest(BaseModel):
-    japanese_text: str
-# 4. "/translate" API 엔드포인트 생성
 @app.post("/translate")
-async def translate(request: TranslationRequest):
-    korean_text = request.korean_text
-    target_lang = request.target_lang
-    # 프롬프트 엔지니어링: 모델에게 원하는 결과물을 명확하게 지시
-    if target_lang == 'en':
-        prompt = f"Translate the following Korean sentence into natural, everyday English. Provide only the translated sentence, without any additional explanations or quotation marks.\n\nKorean: \"{korean_text}\"\n\nEnglish:"
-    elif target_lang == 'ja':
-        prompt = f"Translate the following Korean sentence into natural, everyday Japanese. Provide only the translated sentence, without any additional explanations or quotation marks.\n\nKorean: \"{korean_text}\"\n\nJapanese:"
-    else:
-        return {"error": "Invalid target language"}
-    # Gemma/Qwen 같은 챗봇 모델을 위한 대화 형식
-    messages = [
-        {"role": "user", "content": prompt}
-    ]
-    # 파이프라인으로 텍스트 생성 실행
-    outputs = pipe(
-        messages,
-        max_new_tokens=150,
-        do_sample=True,
-        temperature=0.7,
-        top_k=50,
-        return_full_text=False,
-    )
-    # 결과물에서 필요한 부분만 추출
-    translated_text = outputs[0]["generated_text"].strip()
-    # 프론트엔드에 번역된 텍스트를 JSON 형태로 반환
-    return {"translated_text": translated_text}
-# 5. "/pronunciation" API 엔드포인트 생성
-@app.post("/pronunciation")
-async def get_pronunciation(request: PronunciationRequest):
-    japanese_text = request.japanese_text
-    # 발음 변환을 위한 새로운 프롬프트
-    prompt = f"Provide the Korean pronunciation (Hangul) for the following Japanese sentence. Provide only the Hangul pronunciation, without any other text, labels, or quotation marks.\n\nJapanese: \"{japanese_text}\"\n\nKorean Pronunciation:"
-    messages = [
-        {"role": "user", "content": prompt}
-    ]
-    outputs = pipe(
-        messages,
-        max_new_tokens=150,
-        do_sample=True,
-        temperature=0.7,
-        top_k=50,
-        return_full_text=False,
     )
-    pronunciation_text = outputs[0]["generated_text"].strip()
-    return {"pronunciation_text": pronunciation_text}
-# 서버가 잘 작동하는지 확인하기 위한 기본 주소
 @app.get("/")
 def read_root():
-    return {"message": "Translation API is running"}

+# app.py (GGUF + llama-cpp-python 버전)
 from fastapi import FastAPI
 from pydantic import BaseModel
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
 import uvicorn
+import json
 # 1. FastAPI 앱 인스턴스 생성
 app = FastAPI()
+# 2. GGUF 모델 로딩 준비
+#    TheBloke의 SOLAR 모델을 예시로 사용.
+#    'repo_id'는 모델이 있는 저장소, 'filename'은 그 안의 특정 GGUF 파일명.
+model_repo_id = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF"
+model_filename = "solar-10.7b-instruct-v1.0.Q4_K_S.gguf"
+# Hugging Face Hub에서 GGUF 파일을 다운로드하고, 로컬 경로를 가져온다.
+# 이 과정은 서버 시작 시 한번만 실행된다.
+model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
+# llama-cpp-python을 사용해 GGUF 모델을 메모리에 로드한다.
+# n_gpu_layers=-1 은 GPU를 최대한 사용하라는 뜻. CPU만 쓰는 환경에서는 0으로 설정.
+llm = Llama(
+  model_path=model_path,
+  n_ctx=4096, # 모델이 한번에 처리할 수 있는 최대 토큰 길이
+  n_threads=8, # 사용할 CPU 스레드 수
+  n_gpu_layers=0 # GPU에 올릴 레이어 수 (-1은 가능한 모두 올리라는 뜻)
 )
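+# Request body: a single "text" field holding the Korean sentence (replaces the earlier korean_text / target_lang fields).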
 class TranslationRequest(BaseModel):
+    text: str
+# 3. API 엔드포인트 수정
 @app.post("/translate")
+async def translate_all_in_one(request: TranslationRequest):
+    korean_text = request.text
+    # Prompt uses the "### User:" / "### Assistant:" turn markers (SOLAR instruct template, not the Llama-2 [INST] chat format).
+    prompt = f"""### User:
+        You are a helpful translation and pronunciation assistant.
+        Given the following Korean text, perform three tasks.
+        1. Translate the text into natural, everyday English.
+        2. Translate the text into natural, everyday Japanese.
+        3. Provide the Korean pronunciation (Hangul) for the generated Japanese translation.
+        Format your response as a single, valid JSON object with the keys "english", "japanese", and "pronunciation".
+        Korean Text: "{korean_text}"
+        ### Assistant:
+        """
+    # 모델을 통해 텍스트 생성 실행
+    output = llm(
+      prompt,
+      max_tokens=512,
+      stop=["### User:", "</s>"], # 응답 생성을 멈출 특정 단어
+      temperature=0.7,
+      top_k=50,
+      echo=False # 프롬프트를 다시 출력하지 않도록 설정
     )
+    generated_output = output["choices"][0]["text"].strip()
+    try:
+        # The prompt asks for a bare JSON object; any other output (extra prose, code fences) fails to parse and is handled below.
+        parsed_json = json.loads(generated_output)
+        return parsed_json
+    except (json.JSONDecodeError, IndexError) as e:
+        print(f"JSON 파싱 에러: {e}")
+        print(f"모델 원본 출력: {generated_output}")
+        return {"error": "Failed to parse model output as JSON", "raw_output": generated_output}
 @app.get("/")
 def read_root():
+    return {"message": "GGUF Translation API is running"}

requirements.txt CHANGED Viewed

@@ -2,7 +2,7 @@ fastapi
 uvicorn
 pydantic
 python-multipart
-transformers>=4.42.0
-torch>=2.3.0
 sentencepiece
-accelerate

 uvicorn
 pydantic
 python-multipart
+torch
 sentencepiece
+llama-cpp-python
+huggingface-hub