Spaces:
Runtime error
Runtime error
add track_iter
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from itertools import count, islice
|
| 2 |
-
from typing import Any, Iterable
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import requests
|
|
@@ -9,8 +9,8 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
|
| 9 |
|
| 10 |
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
|
| 11 |
|
| 12 |
-
MAX_ENTITIES = 100
|
| 13 |
MAX_ROWS = 100
|
|
|
|
| 14 |
|
| 15 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
| 16 |
batch_size = 100
|
|
@@ -23,6 +23,17 @@ def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any
|
|
| 23 |
for row_item in rows_resp["rows"]:
|
| 24 |
yield row_item["row"]
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def analyze_dataset(dataset: str) -> pd.DataFrame:
|
| 27 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 28 |
if "error" in info_resp:
|
|
@@ -31,17 +42,18 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
|
|
| 31 |
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
| 32 |
features = Features.from_dict(info_resp["dataset_info"][config]["features"])
|
| 33 |
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
|
|
|
|
| 34 |
scanned_columns = get_columns_with_strings(features)
|
| 35 |
columns_descriptions = [
|
| 36 |
get_column_description(column_name, features[column_name]) for column_name in scanned_columns
|
| 37 |
]
|
| 38 |
-
rows = islice(stream_rows(dataset, config, split), MAX_ROWS)
|
| 39 |
presidio_entities = []
|
| 40 |
-
for presidio_entity in
|
| 41 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 42 |
-
)
|
| 43 |
presidio_entities.append(presidio_entity)
|
| 44 |
-
yield f"
|
| 45 |
|
| 46 |
demo = gr.Interface(
|
| 47 |
fn=analyze_dataset,
|
|
|
|
| 1 |
from itertools import count, islice
|
| 2 |
+
from typing import Any, Iterable, Iterator, TypeVar
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import requests
|
|
|
|
| 9 |
|
| 10 |
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
|
| 11 |
|
|
|
|
| 12 |
MAX_ROWS = 100
|
| 13 |
+
T = TypeVar("T")
|
| 14 |
|
| 15 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
| 16 |
batch_size = 100
|
|
|
|
| 23 |
for row_item in rows_resp["rows"]:
|
| 24 |
yield row_item["row"]
|
| 25 |
|
| 26 |
class track_iter:
    """Wrap an iterable and count how many items have been yielded.

    ``next_idx`` holds the number of items consumed so far, so a caller
    can report streaming progress (e.g. "row 12 / 100") while the
    wrapped iterator is being exhausted lazily.
    """

    def __init__(self, it: Iterable[T]):
        self.it = it        # the wrapped iterable
        self.next_idx = 0   # items yielded so far; bumped lazily in __iter__

    def __iter__(self) -> Iterator[T]:
        # Original annotated this ``-> T``; a generator's __iter__
        # returns an Iterator[T], not a single item.
        # Count each item only as it is actually consumed by the caller.
        for item in self.it:
            self.next_idx += 1
            yield item
| 37 |
def analyze_dataset(dataset: str) -> pd.DataFrame:
|
| 38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 39 |
if "error" in info_resp:
|
|
|
|
| 42 |
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
| 43 |
features = Features.from_dict(info_resp["dataset_info"][config]["features"])
|
| 44 |
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
|
| 45 |
+
num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
|
| 46 |
scanned_columns = get_columns_with_strings(features)
|
| 47 |
columns_descriptions = [
|
| 48 |
get_column_description(column_name, features[column_name]) for column_name in scanned_columns
|
| 49 |
]
|
| 50 |
+
rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
|
| 51 |
presidio_entities = []
|
| 52 |
+
for presidio_entity in presidio_scan_entities(
|
| 53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 54 |
+
):
|
| 55 |
presidio_entities.append(presidio_entity)
|
| 56 |
+
yield f"Scanning {dataset} [{rows.next_idx} / {num_rows}]:", pd.DataFrame(presidio_entities)
|
| 57 |
|
| 58 |
demo = gr.Interface(
|
| 59 |
fn=analyze_dataset,
|