Spaces:

alexkueck
/

TestInferenceAPI

Sleeping

App Files Files Community

alexkueck committed on Dec 17, 2023

Commit

dbcd7ed

1 Parent(s): 28a1b8b

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -14

app.py CHANGED Viewed

@@ -74,14 +74,16 @@ splittet = False
 print ("Inf.Client")
 #client = InferenceClient("https://api-inference.huggingface.co/models/meta-llama/Llama-2-70b-chat-hf")
 #client = InferenceClient("https://ybdhvwle4ksrawzo.eu-west-1.aws.endpoints.huggingface.cloud")
-client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")
 ##############################################
 # tokenizer for generating prompt
 ##############################################
 print ("Tokenizer")
 #tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf")
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
 #tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
 #tokenizer =  AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
@@ -279,22 +281,21 @@ def generate(text, history, rag_option, model_option,  temperature=0.5, max_new_
         #Anfrage an Modell (mit RAG: mit chunks aus Vektorstore, ohne: nur promt und history)
         #payload = tokenizer.apply_chat_template([{"role":"user","content":prompt}],tokenize=False)
-        payload = tokenizer.apply_chat_template(prompt,tokenize=False)
-        result = client.text_generation(
-                    payload,
-                    do_sample=True,
-                    return_full_text=False,
-                    max_new_tokens=2048,
-                    top_p=0.9,
-                    temperature=0.6,
-              )
     except Exception as e:
         raise gr.Error(e)
     #Antwort als Stream ausgeben...
-    for i in range(len(result)):
-        time.sleep(0.05)
-        yield  result[: i+1]

 print ("Inf.Client")
 #client = InferenceClient("https://api-inference.huggingface.co/models/meta-llama/Llama-2-70b-chat-hf")
 #client = InferenceClient("https://ybdhvwle4ksrawzo.eu-west-1.aws.endpoints.huggingface.cloud")
+#Inference mit Authorisation:
+API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
+HEADERS = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
 ##############################################
 # tokenizer for generating prompt
 ##############################################
 print ("Tokenizer")
 #tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf")
+#tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
 #tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
 #tokenizer =  AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
         #Anfrage an Modell (mit RAG: mit chunks aus Vektorstore, ohne: nur promt und history)
         #payload = tokenizer.apply_chat_template([{"role":"user","content":prompt}],tokenize=False)
+        #Für LLAMA:
+        #payload = tokenizer.apply_chat_template(prompt,tokenize=False)
+        #result = client.text_generation(payload, do_sample=True,return_full_text=False, max_new_tokens=2048,top_p=0.9,temperature=0.6,)
+        #inference allg:
+        result= requests.post(API_URL, headers=HEADERS, json=prompt)
     except Exception as e:
         raise gr.Error(e)
+    return result.json()
     #Antwort als Stream ausgeben...
+    #for i in range(len(result)):
+        #time.sleep(0.05)
+        #yield  result[: i+1]