extract_sub_links() — langchain Function Reference

Architecture documentation for the extract_sub_links() function in html.py from the langchain codebase.

Function · python · Observability · TelemetryUtilities · calls: 1

Entity Profile

Observability → TelemetryUtilities → extract_sub_links() — langchain Function Reference

Dependency Diagram

graph TD
  71aa6dfa_b9f9_971f_66bf_e7e11e97ead6["extract_sub_links()"]
  a747c9e4_9d35_6376_2a05_6e094efb6182["html.py"]
  71aa6dfa_b9f9_971f_66bf_e7e11e97ead6 -->|defined in| a747c9e4_9d35_6376_2a05_6e094efb6182
  8806546c_d4c5_b816_1799_3f83f54ae6a2["find_all_links()"]
  71aa6dfa_b9f9_971f_66bf_e7e11e97ead6 -->|calls| 8806546c_d4c5_b816_1799_3f83f54ae6a2
  style 71aa6dfa_b9f9_971f_66bf_e7e11e97ead6 fill:#6366f1,stroke:#818cf8,color:#fff

Relationship Graph

Source Code

libs/core/langchain_core/utils/html.py lines 62–132

def extract_sub_links(
    raw_html: str,
    url: str,
    *,
    base_url: str | None = None,
    pattern: str | re.Pattern | None = None,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
    continue_on_failure: bool = False,
) -> list[str]:
    """Extract all links from a raw HTML string and convert into absolute paths.

    Links are de-duplicated and returned in the order in which they were first
    produced by `find_all_links`; this function does not reorder them.

    Args:
        raw_html: Original HTML.
        url: The url of the HTML. Relative links are resolved against it, and
            its scheme is used for protocol-relative links such as `//to/path`.
        base_url: the base URL to check for outside links against. Defaults to
            `url` when `None`.
        pattern: Regex to use for extracting links from raw HTML. Passed through
            to `find_all_links`.
        prevent_outside: If `True`, ignore external links which are not children
            of the base URL, i.e. links whose network location differs from the
            base URL's or which do not start with the base URL.
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.
        continue_on_failure: If `True`, continue if parsing a specific link raises an
            exception. Otherwise, raise the exception.

    Returns:
        A list of unique absolute paths to sub links.

    Raises:
        Exception: Whatever was raised while parsing or resolving a link, if
            `continue_on_failure` is `False`.
    """
    base_url_to_use = base_url if base_url is not None else url
    # Both values are loop-invariant, so parse once up front.
    base_netloc = urlparse(base_url_to_use).netloc
    page_scheme = urlparse(url).scheme

    # A dict is used as an insertion-ordered set: it removes duplicates while
    # keeping first-seen order, whereas iterating a `set` of strings yields an
    # order that can change from one interpreter run to the next.
    absolute_paths: dict[str, None] = {}
    for link in find_all_links(raw_html, pattern=pattern):
        try:
            parsed_link = urlparse(link)
            # Some may be absolute links like https://to/path
            if parsed_link.scheme in {"http", "https"}:
                absolute_path = link
            # Some may have omitted the protocol like //to/path
            elif link.startswith("//"):
                absolute_path = f"{page_scheme}:{link}"
            else:
                # Relative link: only its path and query are carried over.
                absolute_path = urljoin(url, parsed_link.path)
                if parsed_link.query:
                    absolute_path += f"?{parsed_link.query}"
            absolute_paths[absolute_path] = None
        except Exception as e:
            if continue_on_failure:
                logger.warning(
                    "Unable to load link %s. Raised exception:\n\n%s", link, e
                )
                continue
            raise

    # `str.startswith` accepts a tuple of candidates; an empty tuple never matches.
    excluded = tuple(exclude_prefixes)

    results = []
    for path in absolute_paths:
        if path.startswith(excluded):
            continue

        if prevent_outside:
            if urlparse(path).netloc != base_netloc:
                continue

            # Will take care of verifying rest of path after netloc
            # if it's more specific
            if not path.startswith(base_url_to_use):
                continue

        results.append(path)
    return results