saashley committed on
Commit
fa62106
·
verified ·
1 Parent(s): 4c18db7

Update neo4j_config.py

Browse files
Files changed (1) hide show
  1. neo4j_config.py +1 -106
neo4j_config.py CHANGED
@@ -1,5 +1,3 @@
1
- import re
2
- from neo4j import GraphDatabase
3
  import os
4
 
5
  URI = os.getenv("NEO4J_URI")
@@ -8,107 +6,4 @@ PASSWORD = os.getenv("NEO4J_PASSWORD")
8
  AUTH = (USER, PASSWORD)
9
 
10
  if not all([URI, USER, PASSWORD]):
11
- raise RuntimeError("Missing one or more Neo4j environment variables.")
12
-
13
- with GraphDatabase.driver(URI, auth=AUTH) as driver:
14
- driver.verify_connectivity()
15
-
16
-
17
- def normalize_int(value, default=0):
18
- """
19
- Safely convert value to int.
20
- - If already int, return it.
21
- - If str of digits, parse it.
22
- - Otherwise return `default`.
23
- """
24
- if isinstance(value, int):
25
- return value
26
- if isinstance(value, str) and value.isdigit():
27
- return int(value)
28
- # optionally, extract digits from strings like "1.":
29
- m = re.match(r"(\d+)", str(value))
30
- if m:
31
- return int(m.group(1))
32
- return default
33
-
34
-
35
- def add_municipality(tx, municipality):
36
- tx.run("""
37
- MERGE (m:Municipality {name: $municipality})
38
- """, municipality=municipality)
39
-
40
- def add_document(tx, doc_id, municipality):
41
- tx.run("""
42
- MATCH (m:Municipality {name: $municipality})
43
- MERGE (d:Document {doc_id: $doc_id})
44
- MERGE (m)-[:HAS_DOCUMENT]->(d)
45
- """, municipality=municipality, doc_id=doc_id)
46
-
47
- def add_chunk(tx, chunk):
48
- tx.run("""
49
- MATCH (d:Document {doc_id: $doc_id})
50
- MERGE (c:Chunk {id: $id})
51
- SET c.page = $page,
52
- c.section = $section,
53
- c.level = $level,
54
- c.text = $text,
55
- c.embedding = $embedding
56
- MERGE (d)-[:HAS_CHUNK]->(c)
57
- """, id=chunk["id"], doc_id=chunk["document_id"],
58
- page=chunk["page"], section=chunk["section"],
59
- level=chunk["level"], text=chunk["chunk_text"],
60
- embedding=chunk["embedding"])
61
-
62
- def link_parent(tx, parent_id, child_id):
63
- tx.run("""
64
- MATCH (p:Chunk {id: $parent_id}), (c:Chunk {id: $child_id})
65
- MERGE (p)-[:HAS_SUBSECTION]->(c)
66
- """, parent_id=parent_id, child_id=child_id)
67
-
68
- def link_sibling(tx, sibling1_id, sibling2_id):
69
- tx.run("""
70
- MATCH (c1:Chunk {id: $sibling1_id}), (c2:Chunk {id: $sibling2_id})
71
- MERGE (c1)-[:NEXT_TO]->(c2)
72
- """, sibling1_id=sibling1_id, sibling2_id=sibling2_id)
73
-
74
- # This is slow to recompute; alternatively, we could re-download a .pkl file that already includes the ids.
75
- def sync_chunk_ids(all_chunks, driver, prefix_len=50):
76
- """
77
- For each chunk in-memory, look up its real DB id by matching on:
78
- - page
79
- - section
80
- - the first `prefix_len` chars of text
81
-
82
- When a match is found, sets chunk["id"] to the DB value (overwriting any existing id);
83
- otherwise prints a "No DB match" message and leaves the chunk unchanged.
84
- """
85
- with driver.session() as session:
86
- for chunk in all_chunks:
87
- # build prefix of the chunk text
88
- prefix = chunk["chunk_text"][:prefix_len]
89
- # normalize numeric props
90
- page = normalize_int(chunk.get("page"))
91
-
92
- cypher = """
93
- MATCH (c:Chunk {
94
- page: $page,
95
- section: $section
96
- })
97
- WHERE c.text STARTS WITH $prefix
98
- RETURN c.id AS real_id
99
- LIMIT 1
100
- """
101
- params = {
102
- "page": page,
103
- "section": chunk["section"],
104
- "prefix": prefix
105
- }
106
-
107
- rec = session.run(cypher, params).single()
108
- if rec:
109
- chunk["id"] = rec["real_id"]
110
- else:
111
- print(f"No DB match for chunk: page={page} "
112
- f"section={chunk.get('section')!r} prefix={prefix!r}")
113
-
114
- ## CHUNK INGESTION CODE NOT PRESENT HERE!! CHECK COLAB NB!
 
 
 
1
  import os
2
 
3
  URI = os.getenv("NEO4J_URI")
 
6
  AUTH = (USER, PASSWORD)
7
 
8
  if not all([URI, USER, PASSWORD]):
9
+ raise RuntimeError("Missing one or more Neo4j environment variables.")