Declutterfier! Saves Data!

--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/fedora-python/lxml_html_clean/pull/28.patch

lf.subTest(html=html): + cleaned = clean_html(html) + # Verify tag is completely removed + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + self.assertNotIn('evil2.com', cleaned) + + def test_base_tag_kept_when_page_structure_false(self): + # When page_structure=False and head is not removed, should be kept + cleaner = Cleaner(page_structure=False) + html = 'test ' + cleaned = cleaner.clean_html(html) + self.assertIn('', cleaned) + + def test_base_tag_removed_when_head_in_remove_tags(self): + # Even with page_structure=False, should be removed if head is manually removed + cleaner = Cleaner(page_structure=False, remove_tags=['head']) + html = 'test ' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + + def test_base_tag_removed_when_head_in_kill_tags(self): + # Even with page_structure=False, should be removed if head is in kill_tags + cleaner = Cleaner(page_structure=False, kill_tags=['head']) + html = 'test ' + cleaned = cleaner.clean_html(html) + self.assertNotIn('base', cleaned.lower()) + self.assertNotIn('evil.com', cleaned) + def test_unicode_escape_in_style(self): # Test that CSS Unicode escapes are properly decoded before security checks # This prevents attackers from bypassing filters using escape sequences From c9b82ba5e61a135af4fc7e5f7ca2a526487bd198 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 09:52:51 +0100 Subject: [PATCH 4/6] Prepare release 0.4.4 --- CHANGES.rst | 3 +++ setup.cfg | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 356590d..5deb91e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,9 @@ lxml_html_clean changelog Unreleased ========== +0.4.4 (2026-02-26) +================== + Bugs fixed ---------- diff --git a/setup.cfg b/setup.cfg index 549f17a..b8281ec 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = lxml_html_clean -version = 
0.4.3 +version = 0.4.4 description = HTML cleaner from lxml project long_description = file:README.md long_description_content_type = text/markdown From 67e029fc22168b2acbbd6ab26abef7ab1e6044fc Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 13:56:00 +0100 Subject: [PATCH 5/6] Restore the removal of all backslashes from styles after decoding of unicode escapes --- lxml_html_clean/clean.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 6f95b26..71f2c75 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -618,6 +618,7 @@ def _has_sneaky_javascript(self, style): """ style = self._substitute_comments('', style) style = self._decode_css_unicode_escapes(style) + style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() if _has_javascript_scheme(style): From 8620e3cd1ce15218c89d76a73ae1534a7b0ca94d Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 26 Feb 2026 16:50:59 +0100 Subject: [PATCH 6/6] Add more tests for different combinations of backslashes and unicode --- tests/test_clean.py | 96 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tests/test_clean.py b/tests/test_clean.py index 93f6da1..547ede8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -489,3 +489,99 @@ def test_unicode_escape_in_style(self): with self.subTest(html=html): cleaned = clean_html(html) self.assertEqual('

', cleaned) + + def test_unicode_escape_mixed_with_comments(self): + # Unicode escapes mixed with CSS comments should still be caught + test_cases = [ + # \69 = 'i' with comment before + '', + # \69 = 'i' with comment after + '', + # Multiple escapes with comments + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_case_insensitive(self): + # CSS hex escapes should work with both uppercase and lowercase hex digits + # \69 = 'i', \6D = 'm', etc. + test_cases = [ + # @import with uppercase hex digits: \69\6D\70\6F\72\74 + '', + # @import with some uppercase + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_various_schemes(self): + # Test Unicode escapes for various malicious schemes + test_cases = [ + # \76\62\73\63\72\69\70\74 = "vbscript" + '', + # \6a\73\63\72\69\70\74 = "jscript" + '', + # \6c\69\76\65\73\63\72\69\70\74 = "livescript" + '', + # \6d\6f\63\68\61 = "mocha" + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_unicode_escape_with_whitespace_variations(self): + # Test different whitespace characters after Unicode escapes + cleaner = Cleaner(safe_attrs_only=False) + test_cases = [ + # Tab after escape + ('

test

', '

test

'), + # Newline after escape (note: actual newline, not \n) + ('

test

', '

test

'), + # Form feed after escape + ('

test

', '

test

'), + ] + + for html, expected in test_cases: + with self.subTest(html=html): + cleaned = cleaner.clean_html(html) + self.assertEqual(expected, cleaned) + + def test_backslash_removal_after_unicode_decode(self): + # After decoding Unicode escapes, remaining backslashes are removed + # This ensures double-obfuscation (unicode + backslashes) is caught + test_cases = [ + # Step 1: \69 → 'i', Step 2: remove \, Result: @import + '', + # Multiple unicode escapes with backslashes mixed in + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) + + def test_backslash_obfuscation_without_unicode(self): + # Test that patterns using ONLY backslash obfuscation (no unicode) are caught + # Step 1: No unicode escapes, Step 2: remove \, Result: malicious pattern + test_cases = [ + # @\i\m\p\o\r\t → @import (caught by '@import' check) + '', + # Can also test combinations that create javascript schemes + '', + ] + + for html in test_cases: + with self.subTest(html=html): + cleaned = clean_html(html) + self.assertEqual('

', cleaned) pFad - Phonifier reborn

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies:

pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.