Spaces:

saashley
/

capitolati-rag

Sleeping

App Files Files Community

saashley committed on Jul 14

Commit

fa62106

verified ·

1 Parent(s): 4c18db7

Update neo4j_config.py

Browse files

Files changed (1) hide show

neo4j_config.py +1 -106

neo4j_config.py CHANGED Viewed

@@ -1,5 +1,3 @@
-import re
-from neo4j import GraphDatabase
 import os
 URI = os.getenv("NEO4J_URI")
@@ -8,107 +6,4 @@ PASSWORD = os.getenv("NEO4J_PASSWORD")
 AUTH = (USER, PASSWORD)
 if not all([URI, USER, PASSWORD]):
-    raise RuntimeError("Missing one or more Neo4j environment variables.")
-with GraphDatabase.driver(URI, auth=AUTH) as driver:
-    driver.verify_connectivity()
-def normalize_int(value, default=0):
-    """
-    Safely convert value to int.
-    - If already int, return it.
-    - If str of digits, parse it.
-    - Otherwise return `default`.
-    """
-    if isinstance(value, int):
-        return value
-    if isinstance(value, str) and value.isdigit():
-        return int(value)
-    # optionally, extract digits from strings like "1.":
-    m = re.match(r"(\d+)", str(value))
-    if m:
-        return int(m.group(1))
-    return default
-def add_municipality(tx, municipality):
-    tx.run("""
-        MERGE (m:Municipality {name: $municipality})
-    """, municipality=municipality)
-def add_document(tx, doc_id, municipality):
-    tx.run("""
-        MATCH (m:Municipality {name: $municipality})
-        MERGE (d:Document {doc_id: $doc_id})
-        MERGE (m)-[:HAS_DOCUMENT]->(d)
-    """, municipality=municipality, doc_id=doc_id)
-def add_chunk(tx, chunk):
-    tx.run("""
-        MATCH (d:Document {doc_id: $doc_id})
-        MERGE (c:Chunk {id: $id})
-        SET c.page = $page,
-            c.section = $section,
-            c.level = $level,
-            c.text = $text,
-            c.embedding = $embedding
-        MERGE (d)-[:HAS_CHUNK]->(c)
-    """, id=chunk["id"], doc_id=chunk["document_id"],
-         page=chunk["page"], section=chunk["section"],
-         level=chunk["level"], text=chunk["chunk_text"],
-         embedding=chunk["embedding"])
-def link_parent(tx, parent_id, child_id):
-    tx.run("""
-        MATCH (p:Chunk {id: $parent_id}), (c:Chunk {id: $child_id})
-        MERGE (p)-[:HAS_SUBSECTION]->(c)
-    """, parent_id=parent_id, child_id=child_id)
-def link_sibling(tx, sibling1_id, sibling2_id):
-    tx.run("""
-        MATCH (c1:Chunk {id: $sibling1_id}), (c2:Chunk {id: $sibling2_id})
-        MERGE (c1)-[:NEXT_TO]->(c2)
-    """, sibling1_id=sibling1_id, sibling2_id=sibling2_id)
-# This takes quite some time to compute again; alternatively, we could re-download a .pkl file that already includes the ids.
-def sync_chunk_ids(all_chunks, driver, prefix_len=50):
-    """
-    For each chunk in-memory, look up its real DB id by matching on:
-      - page
-      - section
-      - the first `prefix_len` chars of text
-    When a match is found, chunk["id"] is set to the DB value (overwriting any id already present);
-    when no match is found, the chunk is left unchanged and a message is printed.
-    """
-    with driver.session() as session:
-        for chunk in all_chunks:
-            # build prefix of the chunk text
-            prefix = chunk["chunk_text"][:prefix_len]
-            # normalize numeric props
-            page = normalize_int(chunk.get("page"))
-            cypher = """
-            MATCH (c:Chunk {
-              page: $page,
-              section: $section
-            })
-            WHERE c.text STARTS WITH $prefix
-            RETURN c.id AS real_id
-            LIMIT 1
-            """
-            params = {
-                "page": page,
-                "section": chunk["section"],
-                "prefix": prefix
-            }
-            rec = session.run(cypher, params).single()
-            if rec:
-                chunk["id"] = rec["real_id"]
-            else:
-                print(f"No DB match for chunk: page={page} "
-                      f"section={chunk.get('section')!r} prefix={prefix!r}")
-## CHUNK INGESTION CODE NOT PRESENT HERE!! CHECK COLAB NB!

 import os
 URI = os.getenv("NEO4J_URI")
 AUTH = (USER, PASSWORD)
 if not all([URI, USER, PASSWORD]):
+    raise RuntimeError("Missing one or more Neo4j environment variables.")