Spaces:
Running
on
Zero
Running
on
Zero
Fixed categorical colors.
Browse files- app.py +83 -44
- openalex_utils.py +14 -3
app.py
CHANGED
|
@@ -558,8 +558,9 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 558 |
break
|
| 559 |
|
| 560 |
if should_break_current_query:
|
| 561 |
-
print(f"Successfully
|
| 562 |
-
|
|
|
|
| 563 |
# Continue to next query - don't break out of the main query loop
|
| 564 |
print(f"Query completed in {time.time() - start_time:.2f} seconds")
|
| 565 |
print(f"Total records collected: {len(records)}")
|
|
@@ -576,6 +577,7 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 576 |
|
| 577 |
# Add query_index to the dataframe
|
| 578 |
records_df['query_index'] = query_indices[:len(records_df)]
|
|
|
|
| 579 |
|
| 580 |
if reduce_sample_checkbox and sample_reduction_method != "All" and sample_reduction_method != "n random samples":
|
| 581 |
# Note: We skip "n random samples" here because PyAlex sampling is already done above
|
|
@@ -611,7 +613,9 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 611 |
if sample_reduction_method == "First n samples":
|
| 612 |
records_df = records_df.iloc[:sample_size]
|
| 613 |
print(f"Records processed in {time.time() - processing_start:.2f} seconds")
|
| 614 |
-
|
|
|
|
|
|
|
| 615 |
# Create embeddings - this happens regardless of data source
|
| 616 |
embedding_start = time.time()
|
| 617 |
progress(0.3, desc="Embedding Data...")
|
|
@@ -655,7 +659,7 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 655 |
print('Highlight color:', highlight_color)
|
| 656 |
|
| 657 |
# Check if we have multiple queries and categorical coloring is enabled
|
| 658 |
-
urls
|
| 659 |
has_multiple_queries = len(urls) > 1 and not csv_upload
|
| 660 |
|
| 661 |
if treat_as_categorical_checkbox and has_multiple_queries:
|
|
@@ -677,45 +681,29 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 677 |
print(f"Warning: Could not load colormap '{selected_colormap_name}' for categorical coloring: {e}")
|
| 678 |
# Fallback to default categorical colors
|
| 679 |
categorical_colors = [
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
'#fc8d62', # Light Orange
|
| 691 |
-
'#8da0cb', # Light Blue
|
| 692 |
-
'#e78ac3', # Light Pink
|
| 693 |
-
'#a6d854', # Light Green
|
| 694 |
-
'#ffd92f', # Light Yellow
|
| 695 |
-
'#e5c494', # Beige
|
| 696 |
-
'#b3b3b3', # Light Gray
|
| 697 |
-
]
|
| 698 |
else:
|
| 699 |
# Use default categorical colors
|
| 700 |
categorical_colors = [
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
'#fc8d62', # Light Orange
|
| 712 |
-
'#8da0cb', # Light Blue
|
| 713 |
-
'#e78ac3', # Light Pink
|
| 714 |
-
'#a6d854', # Light Green
|
| 715 |
-
'#ffd92f', # Light Yellow
|
| 716 |
-
'#e5c494', # Beige
|
| 717 |
-
'#b3b3b3', # Light Gray
|
| 718 |
-
]
|
| 719 |
|
| 720 |
# Assign colors based on query_index
|
| 721 |
query_color_map = {query_idx: categorical_colors[i % len(categorical_colors)]
|
|
@@ -813,19 +801,39 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 813 |
color_mapping = {}
|
| 814 |
|
| 815 |
# Get readable names for each query URL
|
|
|
|
| 816 |
for i, query_idx in enumerate(unique_queries):
|
| 817 |
try:
|
| 818 |
if query_idx < len(urls):
|
| 819 |
readable_name = openalex_url_to_readable_name(urls[query_idx])
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
|
|
|
|
|
|
| 823 |
else:
|
| 824 |
readable_name = f"Query {query_idx + 1}"
|
| 825 |
-
except Exception:
|
| 826 |
readable_name = f"Query {query_idx + 1}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 827 |
|
|
|
|
| 828 |
color_mapping[readable_name] = query_color_map[query_idx]
|
|
|
|
|
|
|
|
|
|
| 829 |
|
| 830 |
legend_html, legend_css = categorical_legend_html_css(
|
| 831 |
color_mapping,
|
|
@@ -1043,6 +1051,37 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 1043 |
alpha=0.8,
|
| 1044 |
s=point_size
|
| 1045 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1046 |
print(f"Scatter plot creation completed in {time.time() - scatter_start:.2f} seconds")
|
| 1047 |
|
| 1048 |
# Save plot
|
|
|
|
| 558 |
break
|
| 559 |
|
| 560 |
if should_break_current_query:
|
| 561 |
+
print(f"Successfully downloaded target size for query {i+1}, moving to next query")
|
| 562 |
+
# Continue to next query instead of breaking the entire query loop
|
| 563 |
+
continue
|
| 564 |
# Continue to next query - don't break out of the main query loop
|
| 565 |
print(f"Query completed in {time.time() - start_time:.2f} seconds")
|
| 566 |
print(f"Total records collected: {len(records)}")
|
|
|
|
| 577 |
|
| 578 |
# Add query_index to the dataframe
|
| 579 |
records_df['query_index'] = query_indices[:len(records_df)]
|
| 580 |
+
|
| 581 |
|
| 582 |
if reduce_sample_checkbox and sample_reduction_method != "All" and sample_reduction_method != "n random samples":
|
| 583 |
# Note: We skip "n random samples" here because PyAlex sampling is already done above
|
|
|
|
| 613 |
if sample_reduction_method == "First n samples":
|
| 614 |
records_df = records_df.iloc[:sample_size]
|
| 615 |
print(f"Records processed in {time.time() - processing_start:.2f} seconds")
|
| 616 |
+
|
| 617 |
+
print(query_indices)
|
| 618 |
+
print(records_df)
|
| 619 |
# Create embeddings - this happens regardless of data source
|
| 620 |
embedding_start = time.time()
|
| 621 |
progress(0.3, desc="Embedding Data...")
|
|
|
|
| 659 |
print('Highlight color:', highlight_color)
|
| 660 |
|
| 661 |
# Check if we have multiple queries and categorical coloring is enabled
|
| 662 |
+
# Note: urls was already parsed earlier in the function, so we should use that
|
| 663 |
has_multiple_queries = len(urls) > 1 and not csv_upload
|
| 664 |
|
| 665 |
if treat_as_categorical_checkbox and has_multiple_queries:
|
|
|
|
| 681 |
print(f"Warning: Could not load colormap '{selected_colormap_name}' for categorical coloring: {e}")
|
| 682 |
# Fallback to default categorical colors
|
| 683 |
categorical_colors = [
|
| 684 |
+
"#80418F", # Plum
|
| 685 |
+
"#EDA958", # Earth Yellow
|
| 686 |
+
"#F35264", # Crayola Red
|
| 687 |
+
"#087CA7", # Cerulean
|
| 688 |
+
"#FA826B", # Salmon
|
| 689 |
+
"#475C8F", # Navy Blue
|
| 690 |
+
"#579DA3", # Moonstone Green
|
| 691 |
+
"#d61d22", # Bright Red
|
| 692 |
+
"#97bb3c", # Lime Green
|
| 693 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
else:
|
| 695 |
# Use default categorical colors
|
| 696 |
categorical_colors = [
|
| 697 |
+
"#80418F", # Plum
|
| 698 |
+
"#EDA958", # Earth Yellow
|
| 699 |
+
"#F35264", # Crayola Red
|
| 700 |
+
"#087CA7", # Cerulean
|
| 701 |
+
"#FA826B", # Salmon
|
| 702 |
+
"#475C8F", # Navy Blue
|
| 703 |
+
"#579DA3", # Moonstone Green
|
| 704 |
+
"#d61d22", # Bright Red
|
| 705 |
+
"#97bb3c", # Lime Green
|
| 706 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
|
| 708 |
# Assign colors based on query_index
|
| 709 |
query_color_map = {query_idx: categorical_colors[i % len(categorical_colors)]
|
|
|
|
| 801 |
color_mapping = {}
|
| 802 |
|
| 803 |
# Get readable names for each query URL
|
| 804 |
+
used_names = set() # Track used names to ensure uniqueness
|
| 805 |
for i, query_idx in enumerate(unique_queries):
|
| 806 |
try:
|
| 807 |
if query_idx < len(urls):
|
| 808 |
readable_name = openalex_url_to_readable_name(urls[query_idx])
|
| 809 |
+
print(f"Query {query_idx}: Original readable name: '{readable_name}'")
|
| 810 |
+
# Truncate long names for legend display (increased from 25 to 40 chars)
|
| 811 |
+
if len(readable_name) > 40:
|
| 812 |
+
readable_name = readable_name[:37] + "..."
|
| 813 |
+
print(f"Query {query_idx}: Truncated to: '{readable_name}'")
|
| 814 |
else:
|
| 815 |
readable_name = f"Query {query_idx + 1}"
|
| 816 |
+
except Exception as e:
|
| 817 |
readable_name = f"Query {query_idx + 1}"
|
| 818 |
+
print(f"Query {query_idx}: Exception generating name: {e}")
|
| 819 |
+
|
| 820 |
+
# Ensure uniqueness - if name is already used, append query number
|
| 821 |
+
original_name = readable_name
|
| 822 |
+
counter = 1
|
| 823 |
+
while readable_name in used_names:
|
| 824 |
+
print(f"Query {query_idx}: Name '{readable_name}' already used, making unique...")
|
| 825 |
+
readable_name = f"{original_name} ({query_idx + 1})"
|
| 826 |
+
if len(readable_name) > 40:
|
| 827 |
+
# Re-truncate if needed after adding query number
|
| 828 |
+
base_name = original_name[:32] + "..."
|
| 829 |
+
readable_name = f"{base_name} ({query_idx + 1})"
|
| 830 |
+
counter += 1
|
| 831 |
|
| 832 |
+
used_names.add(readable_name)
|
| 833 |
color_mapping[readable_name] = query_color_map[query_idx]
|
| 834 |
+
print(f"Query {query_idx}: Final legend name: '{readable_name}' -> color: {query_color_map[query_idx]}")
|
| 835 |
+
|
| 836 |
+
print(f"Final color mapping: {color_mapping}")
|
| 837 |
|
| 838 |
legend_html, legend_css = categorical_legend_html_css(
|
| 839 |
color_mapping,
|
|
|
|
| 1051 |
alpha=0.8,
|
| 1052 |
s=point_size
|
| 1053 |
)
|
| 1054 |
+
# Add legend for categorical coloring (not time-based)
|
| 1055 |
+
if plot_type_dropdown != "Time-based coloring" and treat_as_categorical_checkbox and has_multiple_queries:
|
| 1056 |
+
# Get unique categories and their colors from the color mapping dict
|
| 1057 |
+
unique_categories = records_df['query_index'].unique()
|
| 1058 |
+
|
| 1059 |
+
# Create legend handles with larger point size using the color mapping
|
| 1060 |
+
legend_handles = []
|
| 1061 |
+
for query_idx in sorted(unique_categories):
|
| 1062 |
+
# Get the readable name for this query
|
| 1063 |
+
try:
|
| 1064 |
+
if query_idx < len(urls):
|
| 1065 |
+
readable_name = openalex_url_to_readable_name(urls[query_idx])
|
| 1066 |
+
# Truncate long names for legend display
|
| 1067 |
+
if len(readable_name) > 40:
|
| 1068 |
+
readable_name = readable_name[:37] + "..."
|
| 1069 |
+
else:
|
| 1070 |
+
readable_name = f"Query {query_idx + 1}"
|
| 1071 |
+
except Exception as e:
|
| 1072 |
+
readable_name = f"Query {query_idx + 1}"
|
| 1073 |
+
|
| 1074 |
+
color = query_color_map[query_idx]
|
| 1075 |
+
legend_handles.append(plt.Line2D([0], [0], marker='o', color='w',
|
| 1076 |
+
markerfacecolor=color, markersize=9,
|
| 1077 |
+
label=readable_name, linestyle='None'))
|
| 1078 |
+
|
| 1079 |
+
# Add legend in upper left corner
|
| 1080 |
+
plt.legend(handles=legend_handles, loc='upper left', frameon=False,
|
| 1081 |
+
fancybox=False, shadow=False, framealpha=0.9, fontsize=9,
|
| 1082 |
+
#prop={'weight': 'bold'}
|
| 1083 |
+
)
|
| 1084 |
+
|
| 1085 |
print(f"Scatter plot creation completed in {time.time() - scatter_start:.2f} seconds")
|
| 1086 |
|
| 1087 |
# Save plot
|
openalex_utils.py
CHANGED
|
@@ -258,6 +258,12 @@ def openalex_url_to_readable_name(url):
|
|
| 258 |
search_term = value.strip('"\'')
|
| 259 |
parts.append(f"Search: '{search_term}'")
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
elif key == 'publication_year':
|
| 262 |
# Handle year ranges or single years
|
| 263 |
if '-' in value:
|
|
@@ -348,8 +354,13 @@ def openalex_url_to_readable_name(url):
|
|
| 348 |
|
| 349 |
else:
|
| 350 |
# Generic handling for other filters
|
|
|
|
| 351 |
clean_key = key.replace('_', ' ').replace('.', ' ').title()
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
parts.append(f"{clean_key}: {clean_value}")
|
| 354 |
|
| 355 |
except Exception as e:
|
|
@@ -370,7 +381,7 @@ def openalex_url_to_readable_name(url):
|
|
| 370 |
description = f"Works from {year_range}"
|
| 371 |
|
| 372 |
# Limit length to keep it readable
|
| 373 |
-
if len(description) >
|
| 374 |
-
description = description[:
|
| 375 |
|
| 376 |
return description
|
|
|
|
| 258 |
search_term = value.strip('"\'')
|
| 259 |
parts.append(f"Search: '{search_term}'")
|
| 260 |
|
| 261 |
+
elif key == 'title_and_abstract.search':
|
| 262 |
+
# Handle title and abstract search specifically
|
| 263 |
+
from urllib.parse import unquote_plus
|
| 264 |
+
search_term = unquote_plus(value).strip('"\'')
|
| 265 |
+
parts.append(f"T&A: '{search_term}'")
|
| 266 |
+
|
| 267 |
elif key == 'publication_year':
|
| 268 |
# Handle year ranges or single years
|
| 269 |
if '-' in value:
|
|
|
|
| 354 |
|
| 355 |
else:
|
| 356 |
# Generic handling for other filters
|
| 357 |
+
from urllib.parse import unquote_plus
|
| 358 |
clean_key = key.replace('_', ' ').replace('.', ' ').title()
|
| 359 |
+
# Properly decode URL-encoded values
|
| 360 |
+
try:
|
| 361 |
+
clean_value = unquote_plus(value).replace('_', ' ')
|
| 362 |
+
except:
|
| 363 |
+
clean_value = value.replace('_', ' ')
|
| 364 |
parts.append(f"{clean_key}: {clean_value}")
|
| 365 |
|
| 366 |
except Exception as e:
|
|
|
|
| 381 |
description = f"Works from {year_range}"
|
| 382 |
|
| 383 |
# Limit length to keep it readable
|
| 384 |
+
if len(description) > 60:
|
| 385 |
+
description = description[:57] + "..."
|
| 386 |
|
| 387 |
return description
|