mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-26 00:02:35 -04:00 
			
		
		
		
	add unittest for transform_inline_html
This commit is contained in:
		
							parent
							
								
									fda844f64c
								
							
						
					
					
						commit
						e384bd78c5
					
				| @ -296,22 +296,26 @@ class MailDocumentParser(DocumentParser): | |||||||
| 
 | 
 | ||||||
|         return response.content |         return response.content | ||||||
| 
 | 
 | ||||||
|     def transform_inline_html(self, orig_html, attachments): |     @staticmethod | ||||||
|  |     def transform_inline_html(html, attachments): | ||||||
|         def clean_html_script(text: str): |         def clean_html_script(text: str): | ||||||
|             text = text.replace("<script", "<div hidden ") |             compiled_open = re.compile(re.escape("<script"), re.IGNORECASE) | ||||||
|             text = text.replace("</script", "</div") |             text = compiled_open.sub("<div hidden ", text) | ||||||
|  | 
 | ||||||
|  |             compiled_close = re.compile(re.escape("</script"), re.IGNORECASE) | ||||||
|  |             text = compiled_close.sub("</div", text) | ||||||
|             return text |             return text | ||||||
| 
 | 
 | ||||||
|         orig_html = clean_html_script(orig_html) |         html_clean = clean_html_script(html) | ||||||
|         files = [] |         files = [] | ||||||
| 
 | 
 | ||||||
|         for a in attachments: |         for a in attachments: | ||||||
|             name_cid = "cid:" + a.content_id |             name_cid = "cid:" + a.content_id | ||||||
|             name_clean = "".join(e for e in name_cid if e.isalnum()) |             name_clean = "".join(e for e in name_cid if e.isalnum()) | ||||||
|             files.append((name_clean, BytesIO(a.payload))) |             files.append((name_clean, BytesIO(a.payload))) | ||||||
|             orig_html = orig_html.replace(name_cid, name_clean) |             html_clean = html_clean.replace(name_cid, name_clean) | ||||||
| 
 | 
 | ||||||
|         files.append(("index.html", StringIO(orig_html))) |         files.append(("index.html", StringIO(html_clean))) | ||||||
| 
 | 
 | ||||||
|         return files |         return files | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										15
									
								
								src/paperless_mail/tests/samples/sample.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								src/paperless_mail/tests/samples/sample.html
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,15 @@ | |||||||
|  |  <html> | ||||||
|  |     <head> | ||||||
|  |       <meta http-equiv="content-type" content="text/html; charset=UTF-8"> | ||||||
|  |     </head> | ||||||
|  |     <body> | ||||||
|  |       <p>Some Text</p> | ||||||
|  |       <p><img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt=""></p> | ||||||
|  |       <p>and an embedded image.<br> | ||||||
|  |       </p> | ||||||
|  |       <p id="changeme">Paragraph unchanged.</p> | ||||||
|  |       <scRipt> | ||||||
|  |           document.getElementById("changeme").innerHTML = "Paragraph changed via Java Script."; | ||||||
|  |       </script> | ||||||
|  |     </body> | ||||||
|  |   </html> | ||||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_mail/tests/samples/sample.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_mail/tests/samples/sample.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 6.9 KiB | 
| @ -282,3 +282,27 @@ class TestParser(TestCase): | |||||||
|         # Check successful parsing |         # Check successful parsing | ||||||
|         parsed = parser.tika_parse(html) |         parsed = parser.tika_parse(html) | ||||||
|         self.assertEqual(expected_text, parsed) |         self.assertEqual(expected_text, parsed) | ||||||
|  | 
 | ||||||
|  |     def test_transform_inline_html(self): | ||||||
|  |         class MailAttachmentMock: | ||||||
|  |             def __init__(self, payload, content_id): | ||||||
|  |                 self.payload = payload | ||||||
|  |                 self.content_id = content_id | ||||||
|  | 
 | ||||||
|  |         parser = MailDocumentParser(None) | ||||||
|  | 
 | ||||||
|  |         result = None | ||||||
|  | 
 | ||||||
|  |         with open(os.path.join(self.SAMPLE_FILES, "sample.html")) as html_file: | ||||||
|  |             with open(os.path.join(self.SAMPLE_FILES, "sample.png"), "rb") as png_file: | ||||||
|  |                 html = html_file.read() | ||||||
|  |                 png = png_file.read() | ||||||
|  |                 attachments = [ | ||||||
|  |                     MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"), | ||||||
|  |                 ] | ||||||
|  |                 result = parser.transform_inline_html(html, attachments) | ||||||
|  | 
 | ||||||
|  |         resulting_html = result[-1][1].read() | ||||||
|  |         self.assertTrue(result[-1][0] == "index.html") | ||||||
|  |         self.assertTrue(result[0][0] in resulting_html) | ||||||
|  |         self.assertFalse("<script" in resulting_html.lower()) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user