ibraheem007 committed on
Commit
20703d2
·
verified ·
1 Parent(s): 31a0760

Create utils/pdf_export.py

Browse files
Files changed (1) hide show
  1. utils/pdf_export.py +391 -0
utils/pdf_export.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from reportlab.lib.pagesizes import letter, A4
2
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
3
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
4
+ from reportlab.lib.units import inch
5
+ from reportlab.lib import colors
6
+ from reportlab.pdfbase import pdfmetrics
7
+ from reportlab.pdfbase.ttfonts import TTFont
8
+ import re
9
+ from datetime import datetime
10
+
11
def export_content_to_pdf(content, title, student_level, content_type=None, objectives=None):
    """
    Export generated content to a well-formatted PDF.

    Args:
        content: Markdown-like body text. It is split into paragraphs and
            markdown tables by ``clean_and_split_content``.
        title: Document title, rendered centered at the top.
        student_level: Value shown on the "Target Level" metadata line.
        content_type: Optional value for the "Content Type" metadata line.
        objectives: Optional learning-objectives text, rendered in its own
            section before the main content.

    Returns:
        The PDF as bytes. If anything fails, the result of
        ``create_error_pdf`` is returned instead (bytes, or None when even
        the error PDF cannot be built).
    """
    import io
    from xml.sax.saxutils import escape

    buffer = io.BytesIO()

    try:
        print(f"🔧 Starting PDF export: title='{title}', content_length={len(content)}")

        doc = SimpleDocTemplate(
            buffer,
            pagesize=letter,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=72
        )

        styles = getSampleStyleSheet()

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=18,
            spaceAfter=30,
            textColor=colors.darkblue,
            alignment=1
        )

        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading2'],
            fontSize=14,
            spaceAfter=12,
            textColor=colors.darkblue,
            spaceBefore=20
        )

        subheading_style = ParagraphStyle(
            'CustomSubheading',
            parent=styles['Heading3'],
            fontSize=12,
            spaceAfter=6,
            textColor=colors.darkblue,
            spaceBefore=15
        )

        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=12,
            leading=14
        )

        metadata_style = ParagraphStyle(
            'Metadata',
            parent=styles['Normal'],
            fontSize=10,
            textColor=colors.gray,
            spaceAfter=6
        )

        # Build story
        story = []

        # Title
        story.append(Paragraph(title, title_style))
        story.append(Spacer(1, 20))

        # Metadata
        story.append(Paragraph(f"<b>Target Level:</b> {student_level}", metadata_style))
        if content_type:
            story.append(Paragraph(f"<b>Content Type:</b> {content_type}", metadata_style))
        story.append(Paragraph(f"<b>Generated on:</b> {datetime.now().strftime('%Y-%m-%d %H:%M')}", metadata_style))
        story.append(Paragraph("<b>Source:</b> TailorED AI Educational Assistant", metadata_style))

        story.append(Spacer(1, 30))

        # Learning objectives
        if objectives:
            story.append(Paragraph("Learning Objectives", heading_style))
            objectives_clean = clean_html_content(objectives)
            story.append(Paragraph(objectives_clean, normal_style))
            story.append(Spacer(1, 20))

        # Main content
        story.append(Paragraph("Content", heading_style))

        # Process content with proper formatting
        content_paragraphs = clean_and_split_content(content)
        print(f"🔧 Processed {len(content_paragraphs)} content paragraphs")

        for i, paragraph in enumerate(content_paragraphs):
            try:
                if is_markdown_table(paragraph):
                    # Handle markdown tables
                    try:
                        table_data = parse_markdown_table(paragraph)
                        if table_data:
                            table = Table(table_data)
                            table.setStyle(TableStyle([
                                ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                                ('FONTSIZE', (0, 0), (-1, 0), 12),
                                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                                ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                                ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
                                ('FONTSIZE', (0, 1), (-1, -1), 10),
                                ('GRID', (0, 0), (-1, -1), 1, colors.black)
                            ]))
                            story.append(table)
                            story.append(Spacer(1, 20))
                    except Exception as e:
                        # If table parsing fails, fall back to plain text
                        print(f"⚠️ Table parsing failed for paragraph {i}, using plain text: {e}")
                        clean_para = clean_html_content(paragraph)
                        story.append(Paragraph(clean_para, normal_style))
                        story.append(Spacer(1, 8))

                elif is_heading(paragraph):
                    clean_para = clean_html_content(paragraph)
                    story.append(Paragraph(clean_para, subheading_style))
                    story.append(Spacer(1, 8))
                else:
                    clean_para = clean_html_content(paragraph)
                    formatted_para = format_bullet_points(clean_para)
                    story.append(Paragraph(formatted_para, normal_style))
                    story.append(Spacer(1, 8))
            except Exception as para_error:
                print(f"⚠️ Error processing paragraph {i}: {para_error}")
                # Keep the reader's content: retry as fully escaped plain text
                # so no markup is left for the paragraph parser to reject.
                try:
                    plain_text = escape(str(paragraph)).replace('\n', '<br/>')
                    story.append(Paragraph(plain_text, normal_style))
                except Exception:
                    # Last resort: a placeholder so the document still builds
                    story.append(Paragraph(f"Content section {i+1}", normal_style))
                story.append(Spacer(1, 8))

        # Footer note
        story.append(Spacer(1, 30))
        story.append(Paragraph(
            "<i>This content was AI-generated and should be reviewed for accuracy before use in formal educational settings.</i>",
            metadata_style
        ))

        # Build PDF
        print("🔧 Building PDF document...")
        doc.build(story)

        pdf_data = buffer.getvalue()

        # Basic validation
        if len(pdf_data) < 100:
            raise ValueError(f"Generated PDF is too small ({len(pdf_data)} bytes), likely empty")

        print(f"✅ PDF export successful: {len(pdf_data)} bytes")
        return pdf_data

    except Exception as e:
        print(f"❌ PDF export failed: {e}")
        # Return a simple error PDF as fallback
        return create_error_pdf(str(e))
    finally:
        # Runs on both the success and the failure path
        buffer.close()
180
+
181
def create_error_pdf(error_message):
    """Create a simple PDF with error message.

    Args:
        error_message: Text describing what went wrong; converted with
            ``str()`` and XML-escaped before being placed in the document.

    Returns:
        The PDF as bytes, or None if the error PDF itself cannot be built.
    """
    import io
    from xml.sax.saxutils import escape

    buffer = io.BytesIO()

    try:
        doc = SimpleDocTemplate(
            buffer,
            pagesize=letter,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=72
        )

        styles = getSampleStyleSheet()

        # The message often quotes markup (e.g. a paragraph parse error that
        # echoes the offending tags). Escape it so it is rendered literally
        # instead of being parsed as Paragraph markup and failing again.
        safe_message = escape(str(error_message))

        story = [
            Paragraph("PDF Generation Error", styles['Heading1']),
            Spacer(1, 20),
            Paragraph("There was an error generating the PDF:", styles['Normal']),
            Spacer(1, 10),
            Paragraph(f"<i>{safe_message}</i>", styles['Normal']),
            Spacer(1, 20),
            Paragraph("Please try regenerating the content or contact support.", styles['Normal']),
        ]

        doc.build(story)
        return buffer.getvalue()
    except Exception as fallback_error:
        print(f"❌ Even error PDF failed: {fallback_error}")
        return None
    finally:
        buffer.close()
215
+
216
def clean_html_content(text):
    """Clean HTML tags and markdown syntax from text.

    HTML tags are stripped, markdown table syntax is flattened, and
    ``**bold**``, ``*italic*`` and inline code spans are converted to
    ``<b>``, ``<i>`` and ``<font face="Courier">`` markup. Newlines become
    ``<br/>``.

    Args:
        text: Raw text; falsy values yield an empty string.

    Returns:
        The cleaned string. If cleaning raises, the first 500 characters of
        ``str(text)`` are returned instead.
    """
    if not text:
        return ""

    try:
        # Remove HTML tags but keep the content
        text = re.sub(r'<[^>]+>', '', text)

        # Drop markdown table separator rows (e.g. "|---|:---:|") as whole
        # lines only, so hyphens inside ordinary words and "- " bullet
        # markers are left intact.
        text = re.sub(
            r'^[ \t]*\|?[ \t]*:?-{3,}:?[ \t]*(?:\|[ \t]*:?-{3,}:?[ \t]*)*\|?[ \t]*$\n?',
            '',
            text,
            flags=re.MULTILINE,
        )

        # Replace table pipes with a space; only horizontal whitespace is
        # consumed so each table row stays on its own line.
        text = re.sub(r'\|+[ \t]*', ' ', text)

        # Clean up other markdown
        text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)  # Bold
        text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)  # Italic
        text = re.sub(r'`(.*?)`', r'<font face="Courier">\1</font>', text)  # Code

        # Handle line breaks properly
        text = re.sub(r'\n', '<br/>', text)

        # Clean up extra spaces
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'<br/>\s*<br/>', '<br/><br/>', text)

        return text.strip()
    except Exception as e:
        print(f"⚠️ Error cleaning HTML content: {e}")
        return str(text)[:500]  # Return truncated text as fallback
246
+
247
def _split_paragraphs(text):
    """Split text on blank lines, dropping empty and one-character chunks."""
    chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text))
    return [chunk for chunk in chunks if len(chunk) > 1]


def clean_and_split_content(content):
    """Split content into paragraphs and markdown tables, in document order.

    Tables found by ``extract_markdown_tables`` are emitted as single items
    at the position where they occur; the text between tables is split into
    paragraphs on blank lines.

    Args:
        content: The full text to split. Falsy input yields an empty list.

    Returns:
        A list of strings, each either a paragraph or a complete table.
    """
    if not content:
        return []

    try:
        # Walk the ORIGINAL text using the table offsets. This keeps every
        # table exactly where it appeared, including one at the very end.
        tables = sorted(extract_markdown_tables(content), key=lambda item: item[0])

        pieces = []
        cursor = 0
        for table_start, table_end, table_content in tables:
            pieces.extend(_split_paragraphs(content[cursor:table_start]))
            pieces.append(table_content.strip())
            cursor = table_end

        # Text after the last table (or all of it when there are no tables)
        pieces.extend(_split_paragraphs(content[cursor:]))
        return pieces
    except Exception as e:
        print(f"⚠️ Error cleaning and splitting content: {e}")
        # Fallback: simple split by paragraphs
        return [p.strip() for p in content.split('\n\n') if p.strip()]
280
+
281
def extract_markdown_tables(content):
    """Extract markdown tables from content.

    A table is a header row, a separator row made of pipes, dashes and
    whitespace, and one or more data rows, each delimited by ``|``.

    Args:
        content: Text to scan.

    Returns:
        A list of ``(start, end, table_text)`` tuples in order of appearance,
        where ``start``/``end`` are offsets into ``content``. An empty list is
        returned when nothing matches or scanning fails.
    """
    table_pattern = r'(\|.*\|[\r\n]+\|[\s\-|]*[\r\n]+(?:\|.*\|[\r\n]*)+)'

    try:
        return [
            (match.start(), match.end(), match.group(1))
            for match in re.finditer(table_pattern, content)
        ]
    except Exception as e:
        print(f"⚠️ Error extracting markdown tables: {e}")
        return []
295
+
296
def remove_markdown_tables(content):
    """Remove markdown tables from content.

    Uses the same table pattern as ``extract_markdown_tables``; trailing
    newlines after the last table row are removed together with the table.

    Args:
        content: Text to process.

    Returns:
        The text with every matched table deleted, or ``content`` unchanged
        if the substitution fails.
    """
    table_pattern = r'(\|.*\|[\r\n]+\|[\s\-|]*[\r\n]+(?:\|.*\|[\r\n]*)+)'

    try:
        return re.sub(table_pattern, '', content)
    except Exception as e:
        print(f"⚠️ Error removing markdown tables: {e}")
        return content
304
+
305
def is_markdown_table(text):
    """Check if text is a markdown table.

    The text counts as a table when it has at least two lines and both of
    the first two lines contain a ``|``. A separator row is not required.

    Args:
        text: Candidate paragraph.

    Returns:
        True if the text looks like a table, otherwise False (also on error).
    """
    try:
        lines = text.strip().split('\n')
        if len(lines) < 2:
            return False

        # Check if it has table structure
        return all('|' in line for line in lines[:2])
    except Exception as e:
        print(f"⚠️ Error checking markdown table: {e}")
        return False
320
+
321
def parse_markdown_table(table_text):
    """Parse markdown table into 2D array for ReportLab Table.

    Blank lines are ignored. If the second line is a separator row (only
    pipes, dashes, colons, equals signs and whitespace) it is dropped. Rows
    shorter than the widest row are padded with empty strings so the result
    is rectangular.

    Args:
        table_text: Markdown table source.

    Returns:
        A list of rows, each a list of stripped cell strings, or None when
        there are fewer than two non-blank lines or parsing fails.
    """
    try:
        lines = [line.strip() for line in table_text.split('\n') if line.strip()]

        if len(lines) < 2:
            return None

        # Remove separator line if present. Matching the whole line keeps a
        # data row that merely contains "---", and also recognises alignment
        # separators such as "|:-:|".
        separator = lines[1]
        if re.fullmatch(r'[\s|:=-]+', separator) and ('-' in separator or '=' in separator):
            del lines[1]

        table_data = []
        for line in lines:
            # Trim the optional outer pipes before splitting into cells
            if line.startswith('|'):
                line = line[1:]
            if line.endswith('|'):
                line = line[:-1]
            table_data.append([cell.strip() for cell in line.split('|')])

        # Pad ragged rows so every row has the same number of cells
        width = max(len(row) for row in table_data)
        return [row + [''] * (width - len(row)) for row in table_data]
    except Exception as e:
        print(f"⚠️ Error parsing markdown table: {e}")
        return None
347
+
348
def is_heading(text):
    """Check if text appears to be a heading.

    Text shorter than 100 characters is treated as a heading when it ends
    with a colon, is all upper-case, or ``looks_like_heading`` accepts it.

    Args:
        text: Candidate paragraph.

    Returns:
        True if the text looks like a heading, otherwise False (also on
        error).
    """
    try:
        return len(text) < 100 and (
            text.endswith(':') or text.isupper() or looks_like_heading(text)
        )
    except Exception as e:
        print(f"⚠️ Error checking heading: {e}")
        return False
357
+
358
def looks_like_heading(text):
    """Heuristic to detect heading-like text.

    The check is a case-insensitive substring search for a fixed list of
    common section words (for example "introduction" or "summary").

    Args:
        text: Candidate text.

    Returns:
        True if any indicator word occurs in the text, otherwise False
        (also on error).
    """
    heading_indicators = (
        'introduction', 'overview', 'key concepts', 'summary', 'conclusion',
        'examples', 'applications', 'definition', 'theory', 'practice',
        'problem', 'solution', 'advantages', 'disadvantages', 'steps',
    )

    try:
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in heading_indicators)
    except Exception as e:
        print(f"⚠️ Error in looks_like_heading: {e}")
        return False
372
+
373
def format_bullet_points(text):
    """Format bullet points and lists for PDF.

    The text is split into lines on newlines and on ``<br/>`` tags, because
    the caller may already have converted newlines to ``<br/>``. Lines that
    start with ``•``, ``-`` or ``*`` get a ``&bull;`` marker; all other lines,
    including numbered items, are kept as they are after stripping.

    Args:
        text: Paragraph text, optionally containing ``<br/>`` markup.

    Returns:
        The lines joined with ``<br/>``, or ``text`` unchanged on error.
    """
    try:
        formatted_lines = []

        for line in re.split(r'\n|<br\s*/?>', text):
            line = line.strip()
            if line.startswith(('•', '-', '*')):
                formatted_lines.append(f"&bull; {line[1:].strip()}")
            else:
                formatted_lines.append(line)

        return '<br/>'.join(formatted_lines)
    except Exception as e:
        print(f"⚠️ Error formatting bullet points: {e}")
        return text