richlai committed on
Commit
0b1170c
·
1 Parent(s): 891277f
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. aimakerspace/text_utils.py +19 -1
  3. app.py +25 -4
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM python:3.9
2
  RUN useradd -m -u 1000 user
3
  USER user
4
  ENV HOME=/home/user \
 
1
+ FROM python:3.11
2
  RUN useradd -m -u 1000 user
3
  USER user
4
  ENV HOME=/home/user \
aimakerspace/text_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from typing import List
 
3
 
4
 
5
  class TextFileLoader:
@@ -13,6 +14,8 @@ class TextFileLoader:
13
  self.load_directory()
14
  elif os.path.isfile(self.path) and self.path.endswith(".txt"):
15
  self.load_file()
 
 
16
  else:
17
  raise ValueError(
18
  "Provided path is neither a valid directory nor a .txt file."
@@ -22,6 +25,13 @@ class TextFileLoader:
22
  with open(self.path, "r", encoding=self.encoding) as f:
23
  self.documents.append(f.read())
24
 
 
 
 
 
 
 
 
25
  def load_directory(self):
26
  for root, _, files in os.walk(self.path):
27
  for file in files:
@@ -30,6 +40,14 @@ class TextFileLoader:
30
  os.path.join(root, file), "r", encoding=self.encoding
31
  ) as f:
32
  self.documents.append(f.read())
 
 
 
 
 
 
 
 
33
 
34
  def load_documents(self):
35
  self.load()
@@ -74,4 +92,4 @@ if __name__ == "__main__":
74
  print("--------")
75
  print(chunks[-2])
76
  print("--------")
77
- print(chunks[-1])
 
1
  import os
2
  from typing import List
3
+ import fitz
4
 
5
 
6
  class TextFileLoader:
 
14
  self.load_directory()
15
  elif os.path.isfile(self.path) and self.path.endswith(".txt"):
16
  self.load_file()
17
+ elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
18
+ self.load_pdf()
19
  else:
20
  raise ValueError(
21
  "Provided path is neither a valid directory nor a .txt file."
 
25
  with open(self.path, "r", encoding=self.encoding) as f:
26
  self.documents.append(f.read())
27
 
28
+ def load_pdf(self):
29
+ doc = fitz.open(self.path)
30
+ text = ""
31
+ for page in doc:
32
+ text += page.get_text()
33
+ self.documents.append(text)
34
+
35
  def load_directory(self):
36
  for root, _, files in os.walk(self.path):
37
  for file in files:
 
40
  os.path.join(root, file), "r", encoding=self.encoding
41
  ) as f:
42
  self.documents.append(f.read())
43
+ if file.endswith(".pdf"):
44
+ doc = fitz.open(os.path.join(root, file))
45
+ text = ""
46
+ for page in doc:
47
+ text += page.get_text()
48
+ self.documents.append(text)
49
+
50
+
51
 
52
  def load_documents(self):
53
  self.load()
 
92
  print("--------")
93
  print(chunks[-2])
94
  print("--------")
95
+ print(chunks[-1])
app.py CHANGED
@@ -11,6 +11,7 @@ from aimakerspace.openai_utils.embedding import EmbeddingModel
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
 
14
 
15
  system_template = """\
16
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
@@ -64,20 +65,37 @@ def process_text_file(file: AskFileResponse):
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  @cl.on_chat_start
69
  async def on_chat_start():
70
  files = None
71
 
72
  # Wait for the user to upload a file
73
- while files == None:
 
74
  files = await cl.AskFileMessage(
75
- content="Please upload a Text File file to begin!",
76
- accept=["text/plain"],
77
  max_size_mb=2,
78
  timeout=180,
79
  ).send()
80
 
 
81
  file = files[0]
82
 
83
  msg = cl.Message(
@@ -86,7 +104,10 @@ async def on_chat_start():
86
  await msg.send()
87
 
88
  # load the file
89
- texts = process_text_file(file)
 
 
 
90
 
91
  print(f"Processing {len(texts)} text chunks")
92
 
 
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
14
+ import fitz
15
 
16
  system_template = """\
17
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
 
65
  texts = text_splitter.split_texts(documents)
66
  return texts
67
 
68
+ def process_pdf_file(file: AskFileResponse):
69
+ import tempfile
70
+
71
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".pdf") as temp_file:
72
+ temp_file_path = temp_file.name
73
+
74
+ with open(temp_file_path, "wb") as f:
75
+ f.write(file.content)
76
+
77
+ text_loader = TextFileLoader(temp_file_path)
78
+ documents = text_loader.load_documents()
79
+ texts = text_splitter.split_texts(documents)
80
+ return texts
81
+
82
+
83
 
84
  @cl.on_chat_start
85
  async def on_chat_start():
86
  files = None
87
 
88
  # Wait for the user to upload a file
89
+ while files == None :
90
+
91
  files = await cl.AskFileMessage(
92
+ content="Please upload a Text or PDF file to begin!",
93
+ accept=["text/plain", "application/pdf"],
94
  max_size_mb=2,
95
  timeout=180,
96
  ).send()
97
 
98
+
99
  file = files[0]
100
 
101
  msg = cl.Message(
 
104
  await msg.send()
105
 
106
  # load the file
107
+ if file.name.endswith('.pdf'):
108
+ texts = process_pdf_file(file)
109
+ else:
110
+ texts = process_text_file(file)
111
 
112
  print(f"Processing {len(texts)} text chunks")
113