Spaces:
Sleeping
Sleeping
Update neo4j_config.py
Browse files- neo4j_config.py +1 -106
neo4j_config.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
import re
|
| 2 |
-
from neo4j import GraphDatabase
|
| 3 |
import os
|
| 4 |
|
| 5 |
URI = os.getenv("NEO4J_URI")
|
|
@@ -8,107 +6,4 @@ PASSWORD = os.getenv("NEO4J_PASSWORD")
|
|
| 8 |
AUTH = (USER, PASSWORD)
|
| 9 |
|
| 10 |
if not all([URI, USER, PASSWORD]):
|
| 11 |
-
raise RuntimeError("Missing one or more Neo4j environment variables.")
|
| 12 |
-
|
| 13 |
-
with GraphDatabase.driver(URI, auth=AUTH) as driver:
|
| 14 |
-
driver.verify_connectivity()
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def normalize_int(value, default=0):
|
| 18 |
-
"""
|
| 19 |
-
Safely convert value to int.
|
| 20 |
-
- If already int, return it.
|
| 21 |
-
- If str of digits, parse it.
|
| 22 |
-
- Otherwise return `default`.
|
| 23 |
-
"""
|
| 24 |
-
if isinstance(value, int):
|
| 25 |
-
return value
|
| 26 |
-
if isinstance(value, str) and value.isdigit():
|
| 27 |
-
return int(value)
|
| 28 |
-
# otherwise, fall back to extracting the leading digits from strings like "1.":
|
| 29 |
-
m = re.match(r"(\d+)", str(value))
|
| 30 |
-
if m:
|
| 31 |
-
return int(m.group(1))
|
| 32 |
-
return default
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def add_municipality(tx, municipality):
|
| 36 |
-
tx.run("""
|
| 37 |
-
MERGE (m:Municipality {name: $municipality})
|
| 38 |
-
""", municipality=municipality)
|
| 39 |
-
|
| 40 |
-
def add_document(tx, doc_id, municipality):
|
| 41 |
-
tx.run("""
|
| 42 |
-
MATCH (m:Municipality {name: $municipality})
|
| 43 |
-
MERGE (d:Document {doc_id: $doc_id})
|
| 44 |
-
MERGE (m)-[:HAS_DOCUMENT]->(d)
|
| 45 |
-
""", municipality=municipality, doc_id=doc_id)
|
| 46 |
-
|
| 47 |
-
def add_chunk(tx, chunk):
|
| 48 |
-
tx.run("""
|
| 49 |
-
MATCH (d:Document {doc_id: $doc_id})
|
| 50 |
-
MERGE (c:Chunk {id: $id})
|
| 51 |
-
SET c.page = $page,
|
| 52 |
-
c.section = $section,
|
| 53 |
-
c.level = $level,
|
| 54 |
-
c.text = $text,
|
| 55 |
-
c.embedding = $embedding
|
| 56 |
-
MERGE (d)-[:HAS_CHUNK]->(c)
|
| 57 |
-
""", id=chunk["id"], doc_id=chunk["document_id"],
|
| 58 |
-
page=chunk["page"], section=chunk["section"],
|
| 59 |
-
level=chunk["level"], text=chunk["chunk_text"],
|
| 60 |
-
embedding=chunk["embedding"])
|
| 61 |
-
|
| 62 |
-
def link_parent(tx, parent_id, child_id):
|
| 63 |
-
tx.run("""
|
| 64 |
-
MATCH (p:Chunk {id: $parent_id}), (c:Chunk {id: $child_id})
|
| 65 |
-
MERGE (p)-[:HAS_SUBSECTION]->(c)
|
| 66 |
-
""", parent_id=parent_id, child_id=child_id)
|
| 67 |
-
|
| 68 |
-
def link_sibling(tx, sibling1_id, sibling2_id):
|
| 69 |
-
tx.run("""
|
| 70 |
-
MATCH (c1:Chunk {id: $sibling1_id}), (c2:Chunk {id: $sibling2_id})
|
| 71 |
-
MERGE (c1)-[:NEXT_TO]->(c2)
|
| 72 |
-
""", sibling1_id=sibling1_id, sibling2_id=sibling2_id)
|
| 73 |
-
|
| 74 |
-
# This again takes quite some time to compute; alternatively, we could re-download a pkl file that already includes the ids.
|
| 75 |
-
def sync_chunk_ids(all_chunks, driver, prefix_len=50):
|
| 76 |
-
"""
|
| 77 |
-
For each chunk in-memory, look up its real DB id by matching on:
|
| 78 |
-
- page
|
| 79 |
-
- section
|
| 80 |
-
- the first `prefix_len` chars of text
|
| 81 |
-
|
| 82 |
-
When a DB match is found, chunk["id"] is set to the DB value, overwriting an
|
| 83 |
-
existing id or adding the key if it was missing; chunks with no match are left unchanged and reported via print.
|
| 84 |
-
"""
|
| 85 |
-
with driver.session() as session:
|
| 86 |
-
for chunk in all_chunks:
|
| 87 |
-
# build prefix of the chunk text
|
| 88 |
-
prefix = chunk["chunk_text"][:prefix_len]
|
| 89 |
-
# normalize numeric props
|
| 90 |
-
page = normalize_int(chunk.get("page"))
|
| 91 |
-
|
| 92 |
-
cypher = """
|
| 93 |
-
MATCH (c:Chunk {
|
| 94 |
-
page: $page,
|
| 95 |
-
section: $section
|
| 96 |
-
})
|
| 97 |
-
WHERE c.text STARTS WITH $prefix
|
| 98 |
-
RETURN c.id AS real_id
|
| 99 |
-
LIMIT 1
|
| 100 |
-
"""
|
| 101 |
-
params = {
|
| 102 |
-
"page": page,
|
| 103 |
-
"section": chunk["section"],
|
| 104 |
-
"prefix": prefix
|
| 105 |
-
}
|
| 106 |
-
|
| 107 |
-
rec = session.run(cypher, params).single()
|
| 108 |
-
if rec:
|
| 109 |
-
chunk["id"] = rec["real_id"]
|
| 110 |
-
else:
|
| 111 |
-
print(f"No DB match for chunk: page={page} "
|
| 112 |
-
f"section={chunk.get('section')!r} prefix={prefix!r}")
|
| 113 |
-
|
| 114 |
-
## NOTE: The chunk ingestion code is not present here; see the Colab notebook.
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
URI = os.getenv("NEO4J_URI")
|
|
|
|
| 6 |
AUTH = (USER, PASSWORD)
|
| 7 |
|
| 8 |
if not all([URI, USER, PASSWORD]):
|
| 9 |
+
raise RuntimeError("Missing one or more Neo4j environment variables.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|