Spaces:

everydayconversify
/

conversify-model

Sleeping

App Files Files Community

conversify-model / app.py

koh1018

Add llama-cpp-python wheel via Git LFS

dd11498 4 months ago

raw

history blame

3.13 kB

	# app.py (GGUF + llama-cpp-python version)

	from fastapi import FastAPI
	from pydantic import BaseModel
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download
	import uvicorn
	import json

	# 1. Create the FastAPI application instance.
	app = FastAPI()

	# 2. Prepare the GGUF model.
	# The active configuration is TheBloke's SOLAR build: `model_repo_id` names
	# the Hub repository and `model_filename` the specific GGUF file inside it.
	model_repo_id = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF"
	model_filename = "solar-10.7b-instruct-v1.0.Q4_K_S.gguf"

	# Alternative configuration kept for testing with the 7B Qwen 2.5 model:
	# model_repo_id = "Triangle104/Qwen2.5-7B-Instruct-Q4_K_S-GGUF"
	# model_filename = "qwen2.5-7b-instruct-q4_k_s.gguf"

	# Download the GGUF file from the Hugging Face Hub and keep its local path.
	# This is a module-level statement, so it runs once when the server starts.
	model_path = hf_hub_download(filename=model_filename, repo_id=model_repo_id)

	# Load the GGUF model into memory through llama-cpp-python.
	llm = Llama(
	    model_path=model_path,
	    # Number of layers offloaded to the GPU. 0 keeps everything on the CPU;
	    # -1 would offload as many layers as possible.
	    n_gpu_layers=0,
	    # Number of CPU threads to use.
	    n_threads=8,
	    # Maximum number of tokens the model can process at once.
	    n_ctx=4096,
	)

	# Request body schema: a JSON object carrying the text to translate.
	# (The body format is unchanged from the previous version of this app.)
	class TranslationRequest(BaseModel):
	    # Source text; the /translate handler treats it as Korean.
	    text: str

	# 3. Translation endpoint and its JSON-parsing helper.
	def _parse_model_json(raw_text):
	    """Decode the model's reply as JSON, tolerating text around the object.

	    The reply is first parsed as-is. If that fails, the span from the first
	    ``{`` to the last ``}`` is parsed instead, which recovers replies that
	    are wrapped in Markdown code fences or surrounded by explanatory prose.

	    Args:
	        raw_text: The stripped completion text.

	    Returns:
	        The decoded JSON value.

	    Raises:
	        json.JSONDecodeError: If neither attempt yields valid JSON.
	    """
	    try:
	        return json.loads(raw_text)
	    except json.JSONDecodeError:
	        start = raw_text.find("{")
	        end = raw_text.rfind("}")
	        if start == -1 or end < start:
	            # No brace-delimited candidate to fall back on.
	            raise
	        return json.loads(raw_text[start:end + 1])


	@app.post("/translate")
	async def translate_all_in_one(request: TranslationRequest):
	    """Translate Korean text to English and Japanese in one model call.

	    Builds a single prompt asking for an English translation, a Japanese
	    translation and a Hangul pronunciation of the Japanese, then returns
	    the model's JSON reply. If the reply cannot be read as JSON, a dict
	    with the keys "error" and "raw_output" is returned instead.
	    """
	    korean_text = request.text

	    # The prompt uses the "### User:" / "### Assistant:" turn markers.
	    prompt = f"""### User:
	You are a helpful translation and pronunciation assistant.
	Given the following Korean text, perform three tasks.
	1. Translate the text into natural, everyday English.
	2. Translate the text into natural, everyday Japanese.
	3. Provide the Korean pronunciation (Hangul) for the generated Japanese translation.

	Format your response as a single, valid JSON object with the keys "english", "japanese", and "pronunciation".

	Korean Text: "{korean_text}"

	### Assistant:
	"""

	    # Run text generation.
	    # NOTE(review): this is a synchronous call inside an async handler, so
	    # the event loop is occupied until generation finishes -- confirm that
	    # serializing requests this way is intended.
	    output = llm(
	        prompt,
	        max_tokens=512,
	        stop=["### User:", "</s>"],  # Strings that end generation.
	        temperature=0.7,
	        top_k=50,
	        echo=False,  # Do not repeat the prompt in the output.
	    )

	    # Initialised up front so the error payload can always report it, even
	    # when the completion carries no usable choice.
	    generated_output = ""
	    try:
	        # Indexing happens inside the try so that an empty or malformed
	        # completion is reported through the error payload, not raised.
	        generated_output = output["choices"][0]["text"].strip()
	        return _parse_model_json(generated_output)
	    except (json.JSONDecodeError, KeyError, IndexError) as e:
	        print(f"JSON 파싱 에러: {e}")
	        print(f"모델 원본 출력: {generated_output}")
	        return {"error": "Failed to parse model output as JSON", "raw_output": generated_output}

	@app.get("/")
	def read_root():
	    # Root route: responds with a fixed status message.
	    status_payload = {"message": "GGUF Translation API is running"}
	    return status_payload