Source code for langchain_community.document_transformers.beautiful_soup_transformer

from typing import Any, Iterator, List, Sequence, cast

from langchain_core.documents import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import BeautifulSoupTransformer

            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """  # noqa: E501

    def __init__(self) -> None:
        """Initialize the transformer.

        Checks that the BeautifulSoup4 package can be imported.

        Raises:
            ImportError: If ``bs4`` cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as exc:
            # Chain the original failure so the underlying import problem
            # stays visible in the traceback.
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            ) from exc

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: Sequence[str] = ("script", "style"),
        tags_to_extract: Sequence[str] = ("p", "li", "div", "a"),
        remove_lines: bool = True,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Transform a list of Document objects by cleaning their HTML content.

        Each document's ``page_content`` is overwritten in place with the
        cleaned text, and the same ``documents`` sequence is returned.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: Names of tags to be removed from the HTML. A list
                or a tuple is accepted.
            tags_to_extract: Names of tags whose content will be extracted.
                A list or a tuple is accepted.
            remove_lines: If set to True, unnecessary lines will be removed
                from the extracted content.
            **kwargs: Accepted but not used by this method.

        Returns:
            The input sequence of Document objects, with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            # Drop unwanted tags first so that their text cannot leak into
            # the extraction step that follows.
            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)
            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_tags(html_content: str, unwanted_tags: Sequence[str]) -> str:
        """Remove unwanted tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: Names of tags to be removed from the HTML.

        Returns:
            A cleaned HTML string with unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()

        return str(soup)

    @staticmethod
    def extract_tags(html_content: str, tags: Sequence[str]) -> str:
        """Extract specific tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: Names of tags to be extracted from the HTML.

        Returns:
            A string combining the content of the extracted tags, joined
            with single spaces.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts: List[str] = []

        for element in soup.find_all():
            if element.name in tags:
                # Extract all navigable strings recursively from this element.
                text_parts.extend(get_navigable_strings(element))

                # To avoid duplicate text, remove all descendants from the soup.
                element.decompose()

        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """Clean up the content by removing unnecessary lines.

        The content is split on ``"\\n"``, each line is stripped of
        surrounding whitespace, empty lines are dropped, and the remaining
        lines are joined with single spaces.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        stripped_lines = (line.strip() for line in content.split("\n"))
        return " ".join(line for line in stripped_lines if line)

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous transformation is not implemented by this class.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError
def get_navigable_strings(element: Any) -> Iterator[str]:
    """Yield the stripped text of every navigable string under an element.

    Child tags are descended into recursively. When the element that
    directly contains a string is an ``a`` tag with a truthy ``href``
    attribute, the link target is appended to the text in parentheses.

    Args:
        element: A BeautifulSoup element.

    Returns:
        A generator of strings.
    """
    from bs4 import NavigableString, Tag

    for node in cast(Tag, element).children:
        if isinstance(node, Tag):
            # Nested tag: recurse and emit its strings in document order.
            yield from get_navigable_strings(node)
            continue

        if not isinstance(node, NavigableString):
            continue

        text = node.strip()
        # Only look up the href when the direct parent is an anchor tag.
        href = element.get("href") if element.name == "a" else None
        yield f"{text} ({href})" if href else text