TestHTMLSectionSplitterSecurity Class — langchain Architecture
Architecture documentation for the TestHTMLSectionSplitterSecurity class in test_html_security.py from the langchain codebase.
Entity Profile
Dependency Diagram
graph TD
    c36248a5_01d0_c9ad_b6b4_e15022cac62c["TestHTMLSectionSplitterSecurity"]
    1db8e4d6_c273_3105_7553_9bf58752dc26["test_html_security.py"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|defined in| 1db8e4d6_c273_3105_7553_9bf58752dc26
    0c4dbd4d_6e59_55d7_088c_a4dd38af6441["test_xxe_entity_attack_blocked()"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|method| 0c4dbd4d_6e59_55d7_088c_a4dd38af6441
    d343734a_f94c_bc1c_a995_fcb524ba7232["test_xxe_document_function_blocked()"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|method| d343734a_f94c_bc1c_a995_fcb524ba7232
    0956e94d_da18_da49_4fbb_8a571b7a690d["test_secure_parser_configuration()"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|method| 0956e94d_da18_da49_4fbb_8a571b7a690d
    d95c6f46_2e18_4f55_3c87_5dde79b306aa["test_no_network_access()"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|method| d95c6f46_2e18_4f55_3c87_5dde79b306aa
    78bf3fbd_1a94_85cc_034d_d9f023580a27["test_dtd_processing_disabled()"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|method| 78bf3fbd_1a94_85cc_034d_d9f023580a27
    24d38951_1094_cbb7_dcfa_9199504853d9["test_safe_default_xslt_usage()"]
    c36248a5_01d0_c9ad_b6b4_e15022cac62c -->|method| 24d38951_1094_cbb7_dcfa_9199504853d9
Relationship Graph
Source Code
libs/text-splitters/tests/unit_tests/test_html_security.py lines 9–130
class TestHTMLSectionSplitterSecurity:
"""Security tests for HTMLSectionSplitter to ensure XXE prevention."""
def test_xxe_entity_attack_blocked(self) -> None:
    """Check that split output carries no external-entity payload text.

    The input is plain, benign HTML. The ``xslt_path`` parameter has been
    removed, which eliminates that attack vector; the splitter should rely
    on its default XSLT only.
    """
    benign_html = """<html><body><p>Test content</p></body></html>"""
    section_splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1")],
    )
    documents = section_splitter.split_text(benign_html)

    # Flatten every chunk into one string so each marker is checked once.
    combined_text = " ".join(doc.page_content for doc in documents)

    # "root:" is what leaked /etc/passwd content would contain.
    assert "root:" not in combined_text
    assert "XXE Attack Result" not in combined_text
def test_xxe_document_function_blocked(self) -> None:
    """Check that ordinary HTML is still split correctly.

    No ``document()`` payload is supplied here: the test feeds benign HTML
    with one ``h1`` header and confirms that splitting yields output which
    includes the paragraph text. The intent is that the secure parser
    configuration blocks ``document()`` attacks even if the default XSLT
    were modified internally.
    """
    sample_html = (
        """<html><body><h1>Test Header</h1><p>Test content</p></body></html>"""
    )
    section_splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1")],
    )
    documents = section_splitter.split_text(sample_html)

    # At least one chunk must come back ...
    chunk_count = len(documents)
    assert chunk_count > 0

    # ... and the paragraph text must survive in one of them.
    body_text_found = any(
        "Test content" in doc.page_content for doc in documents
    )
    assert body_text_found
def test_secure_parser_configuration(self) -> None:
    """Check that header conversion returns a string for simple HTML.

    Calls ``convert_possible_tags_to_header`` directly, which is the method
    expected to use the hardened parsers, and verifies only the type of
    what it returns.
    """
    sample_html = """<html><body><h1>Test</h1></body></html>"""
    section_splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1")],
    )

    transformed = section_splitter.convert_possible_tags_to_header(sample_html)

    # The transformation must yield a real string, never None.
    assert transformed is not None
    assert isinstance(transformed, str)
def test_no_network_access(self) -> None:
    """Check that an external HTTP entity does not surface in the output.

    The input declares an entity pointing at an attacker-controlled URL and
    references it inside a paragraph. The test asserts that the host name
    is absent from the split text; it does not observe network traffic
    directly.
    """
    # DOCTYPE with an external SYSTEM entity that is referenced in <p>.
    malicious_html = """<?xml version="1.0"?>
<!DOCTYPE html [
<!ENTITY external SYSTEM "http://attacker.com/xxe">
]>
<html>
<body>
<h1>Test</h1>
<p>&external;</p>
</body>
</html>"""
    section_splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1")],
    )
    documents = section_splitter.split_text(malicious_html)

    # Join all chunks so a single membership check covers the whole result.
    combined_text = " ".join(doc.page_content for doc in documents)
    assert "attacker.com" not in combined_text
def test_dtd_processing_disabled(self) -> None:
"""Test that DTD processing is disabled."""
# HTML with DTD that attempts to define entities
html_with_dtd = """<!DOCTYPE html [
<!ELEMENT html (body)>
<!ELEMENT body (h1, p)>
Source
Frequently Asked Questions
What is the TestHTMLSectionSplitterSecurity class?
TestHTMLSectionSplitterSecurity is a class in the langchain codebase, defined in libs/text-splitters/tests/unit_tests/test_html_security.py.
Where is TestHTMLSectionSplitterSecurity defined?
TestHTMLSectionSplitterSecurity is defined in libs/text-splitters/tests/unit_tests/test_html_security.py at line 9.
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free