import time
import json
import requests
import gradio as gr
STYLE = """
.no-border {
    border: none !important;
}
.group-border {
    padding: 10px;
    border-width: 1px;
    border-radius: 10px;
    border-color: gray;
    border-style: solid;
    box-shadow: 1px 1px 3px;
}
.control-label-font {
    font-size: 13pt !important;
}
.control-button {
    background: none !important;
    border-color: #69ade2 !important;
    border-width: 2px !important;
    color: #69ade2 !important;
}
.center {
    text-align: center;
}
.right {
    text-align: right;
}
.no-label {
    padding: 0px !important;
}
.no-label > label > span {
    display: none;
}
.small-big {
    font-size: 12pt !important;
}
"""
def available_providers():
    headers = {
        "Content-Type": "application/json",
    }
    endpoint_url = "https://api.endpoints.huggingface.cloud/v2/provider"
    response = requests.get(endpoint_url, headers=headers)

    providers = {}
    for provider in response.json()['vendors']:
        if provider['status'] == 'available':
            regions = {}
            availability = False

            for region in provider['regions']:
                if region["status"] == "available":
                    regions[region['name']] = {
                        "label": region['label'],
                        "computes": region['computes']
                    }
                    availability = True

            if availability:
                providers[provider['name']] = regions

    return providers
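# Illustrative shape of the mapping returned by available_providers(). The vendor,
# region, and compute values below are made up; the real entries come from the live
# provider API response, but the nesting (vendor -> region -> label/computes) matches
# how the function builds the dict:
#
#   {
#       "aws": {
#           "us-east-1": {
#               "label": "US East (N. Virginia)",
#               "computes": [{"accelerator": "gpu", "instanceType": "g5.2xlarge", ...}],
#           },
#       },
#   }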
providers = available_providers()
def update_regions(provider):
    available_regions = []

    regions = providers[provider]
    for region, attributes in regions.items():
        available_regions.append(f"{region}[{attributes['label']}]")

    return gr.Dropdown.update(
        choices=available_regions,
        value=available_regions[0] if len(available_regions) > 0 else None
    )
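# The choices produced above look like "us-east-1[US East (N. Virginia)]" (label is
# illustrative). update_compute_options() and submit() recover the region name by
# splitting on "[", so this format has to stay in sync with that parsing.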
def update_compute_options(provider, region):
    available_compute_options = []

    computes = providers[provider][region.split("[")[0].strip()]["computes"]
    for compute in computes:
        if compute['status'] == 'available':
            accelerator = compute['accelerator']
            numAccelerators = compute['numAccelerators']
            memoryGb = compute['memoryGb']
            architecture = compute['architecture']
            instanceType = compute['instanceType']
            pricePerHour = compute['pricePerHour']

            type_desc = f"{numAccelerators}vCPU {memoryGb} · {architecture}" if accelerator == "cpu" else f"{numAccelerators}x {architecture}"

            available_compute_options.append(
                f"{accelerator.upper()} [{compute['instanceSize']}] · {type_desc} · {instanceType} · ${pricePerHour}/hour"
            )

    return gr.Dropdown.update(
        choices=available_compute_options,
        value=available_compute_options[0] if len(available_compute_options) > 0 else None
    )
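# A produced option string looks roughly like
#   "GPU [medium] · 1x NVIDIA A10G · g5.2xlarge · $1.30/hour"
# (the values are illustrative). submit() parses this string back into accelerator,
# instance size, and instance type, so the "·", "[", and "]" separators matter.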
def submit(
    hf_account_input,
    hf_token_input,
    endpoint_name_input,
    provider_selector,
    region_selector,
    repository_selector,
    task_selector,
    framework_selector,
    compute_selector,
    min_node_selector,
    max_node_selector,
    security_selector,
    custom_kernel,
    max_input_length,
    max_tokens,
    max_batch_prefill_token,
    max_batch_total_token
):
    # Recover accelerator, instance size, and instance type from the selected
    # compute option string built in update_compute_options().
    compute_resources = compute_selector.split("·")
    accelerator = compute_resources[0][:3].strip()

    size_l_index = compute_resources[0].index("[") + 1
    size_r_index = compute_resources[0].index("]")
    size = compute_resources[0][size_l_index:size_r_index].strip()

    instance_type = compute_resources[-2].strip()
    payload = {
        "accountId": hf_account_input.strip(),
        "compute": {
            "accelerator": accelerator.lower(),
            "instanceSize": size,
            "instanceType": instance_type,
            "scaling": {
                "maxReplica": int(max_node_selector),
                "minReplica": int(min_node_selector)
            }
        },
        "model": {
            "framework": framework_selector.lower(),
            "image": {
                "custom": {
                    "health_route": "/health",
                    "env": {
                        # Custom kernels are disabled only when the user picks "Disabled".
                        "DISABLE_CUSTOM_KERNELS": "false" if custom_kernel == "Enabled" else "true",
                        "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_token),
                        "MAX_BATCH_TOTAL_TOKENS": str(max_batch_total_token),
                        "MAX_INPUT_LENGTH": str(max_input_length),
                        "MAX_TOTAL_TOKENS": str(max_tokens),
                        "MODEL_ID": repository_selector.lower(),
                        # QUANTIZE: 'bitsandbytes' | 'gptq'
                    },
                    "url": "ghcr.io/huggingface/text-generation-inference:1.0.1",
                }
            },
            "repository": repository_selector.lower(),
            # "revision": "main",
            "task": task_selector.lower()
        },
        "name": endpoint_name_input.strip().lower(),
        "provider": {
            "region": region_selector.split("[")[0].lower(),
            "vendor": provider_selector.lower()
        },
        "type": security_selector.lower()
    }
    print(payload)

    payload = json.dumps(payload)
    print(payload)

    headers = {
        "Authorization": f"Bearer {hf_token_input.strip()}",
        "Content-Type": "application/json",
    }
    endpoint_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{hf_account_input.strip()}"
    print(endpoint_url)

    response = requests.post(endpoint_url, headers=headers, data=payload)

    if response.status_code == 400:
        return f"{response.text}. Malformed data in {payload}"
    elif response.status_code == 401:
        return "Invalid token"
    elif response.status_code == 409:
        return f"Endpoint {endpoint_name_input} already exists"
    elif response.status_code == 202:
        return f"Endpoint {endpoint_name_input} created successfully on {provider_selector.lower()} using {repository_selector.lower()}@main.\nPlease check out the progress at https://ui.endpoints.huggingface.co/endpoints."
    else:
        return f"Something went wrong: {response.status_code} = {response.text}"
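# Optional sketch (not wired into the UI): poll the new endpoint until it reaches a
# terminal state. The GET route and the "status.state" field are assumptions based on
# the public Inference Endpoints API, not something this app exercises; treat this as
# a minimal illustration rather than a verified implementation.
def wait_for_endpoint(hf_account, hf_token, endpoint_name, interval_s=15, timeout_s=900):
    url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{hf_account.strip()}/{endpoint_name.strip().lower()}"
    headers = {"Authorization": f"Bearer {hf_token.strip()}"}

    deadline = time.time() + timeout_s
    while time.time() < deadline:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return f"status check failed: {response.status_code} = {response.text}"

        # "status.state" is an assumed field name for the endpoint lifecycle state.
        state = response.json().get("status", {}).get("state", "unknown")
        if state in ("running", "failed"):
            return state

        time.sleep(interval_s)

    return "timed out waiting for the endpoint"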
with gr.Blocks(css=STYLE) as hf_endpoint:
    with gr.Tab("Hugging Face", elem_classes=["no-border"]):
        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])

        with gr.Column(elem_classes=["group-border"]):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""### Hugging Face account ID (name)""")
                    hf_account_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])

                with gr.Column():
                    gr.Markdown("### Hugging Face access token")
                    hf_token_input = gr.Textbox(show_label=False, type="password", elem_classes=["no-label", "small-big"])

            with gr.Row():
                with gr.Column():
                    gr.Markdown("""### Target model
Model from the Hugging Face hub""")
                    repository_selector = gr.Textbox(
                        value="NousResearch/Nous-Hermes-Llama2-13b",
                        interactive=False,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

                with gr.Column():
                    gr.Markdown("""### Target model version (branch)
Branch name of the model""")
                    revision_selector = gr.Textbox(
                        value="main",
                        interactive=False,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )
        with gr.Column(elem_classes=["group-border"]):
            with gr.Column():
                gr.Markdown("""### Endpoint name
Name for your new endpoint""")
                endpoint_name_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])

            with gr.Row():
                with gr.Column():
                    gr.Markdown("""### Cloud Provider""")
                    provider_selector = gr.Dropdown(
                        choices=list(providers.keys()),
                        interactive=True,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

                with gr.Column():
                    gr.Markdown("""### Cloud Region""")
                    region_selector = gr.Dropdown(
                        [],
                        value="",
                        interactive=True,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

            with gr.Row(visible=False):
                with gr.Column():
                    gr.Markdown("### Task")
                    task_selector = gr.Textbox(
                        value="text-generation",
                        interactive=False,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

                with gr.Column():
                    gr.Markdown("### Framework")
                    framework_selector = gr.Textbox(
                        value="PyTorch",
                        interactive=False,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

            with gr.Column():
                gr.Markdown("""### Compute Instance Type""")
                compute_selector = gr.Dropdown(
                    [],
                    value="",
                    interactive=True,
                    show_label=False,
                    elem_classes=["no-label", "small-big"]
                )
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""### Min Number of Nodes""")
                    min_node_selector = gr.Number(
                        value=1,
                        interactive=True,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

                with gr.Column():
                    gr.Markdown("""### Max Number of Nodes""")
                    max_node_selector = gr.Number(
                        value=1,
                        interactive=True,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )

                with gr.Column():
                    gr.Markdown("""### Security Level""")
                    security_selector = gr.Radio(
                        choices=["Protected", "Public", "Private"],
                        value="Public",
                        interactive=True,
                        show_label=False,
                        elem_classes=["no-label", "small-big"]
                    )
        with gr.Column(elem_classes=["group-border"]):
            with gr.Accordion("Serving Container", open=False, elem_classes=["no-border"]):
                with gr.Column():
                    gr.Markdown("""### Container Type
Text Generation Inference is an optimized container for the text-generation task""")
                    _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""### Custom CUDA Kernels
TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
                        custom_kernel = gr.Dropdown(
                            value="Enabled",
                            choices=["Enabled", "Disabled"],
                            interactive=True,
                            show_label=False,
                            elem_classes=["no-label", "small-big"]
                        )

                    with gr.Column():
                        gr.Markdown("""### Quantization
Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
                        _ = gr.Dropdown(
                            value="None",
                            choices=["None", "Bitsandbytes", "GPTQ"],
                            interactive=True,
                            show_label=False,
                            elem_classes=["no-label", "small-big"]
                        )

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""### Max Input Length (per Query)
Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
                        max_input_length = gr.Number(
                            value=1024,
                            interactive=True,
                            show_label=False,
                            elem_classes=["no-label", "small-big"]
                        )

                    with gr.Column():
                        gr.Markdown("""### Max Number of Tokens (per Query)
The larger this value, the more memory each request will consume and the less effective batching can be.""")
                        max_tokens = gr.Number(
                            value=1512,
                            interactive=True,
                            show_label=False,
                            elem_classes=["no-label", "small-big"]
                        )
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""### Max Batch Prefill Tokens
Number of prefill tokens used during continuous batching. It can be useful to adjust this number, since the prefill operation is memory-intensive and compute-bound.""")
                        max_batch_prefill_token = gr.Number(
                            value=2048,
                            interactive=True,
                            show_label=False,
                            elem_classes=["no-label", "small-big"]
                        )

                    with gr.Column():
                        gr.Markdown("""### Max Batch Total Tokens
Number of tokens that can be passed before waiting queries are forced onto the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
                        max_batch_total_token = gr.Number(
                            value=None,
                            interactive=True,
                            show_label=False,
                            elem_classes=["no-label", "small-big"]
                        )

        submit_button = gr.Button(
            value="Submit",
            elem_classes=["control-label-font", "control-button"]
        )

        status_txt = gr.Textbox(
            value="Any status updates will be displayed here",
            interactive=False,
            elem_classes=["no-label"]
        )
        provider_selector.change(update_regions, inputs=provider_selector, outputs=region_selector)
        region_selector.change(update_compute_options, inputs=[provider_selector, region_selector], outputs=compute_selector)

        submit_button.click(
            submit,
            inputs=[
                hf_account_input,
                hf_token_input,
                endpoint_name_input,
                provider_selector,
                region_selector,
                repository_selector,
                task_selector,
                framework_selector,
                compute_selector,
                min_node_selector,
                max_node_selector,
                security_selector,
                custom_kernel,
                max_input_length,
                max_tokens,
                max_batch_prefill_token,
                max_batch_total_token
            ],
            outputs=status_txt
        )
    with gr.Tab("AWS", elem_classes=["no-border"]):
        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])

    with gr.Tab("GCP", elem_classes=["no-border"]):
        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])

    with gr.Tab("Azure", elem_classes=["no-border"]):
        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])

    with gr.Tab("Lambdalabs", elem_classes=["no-border"]):
        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])

hf_endpoint.launch(enable_queue=True, debug=True)