Source code for langchain_text_splitters.spacy

from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


[docs]class SpacyTextSplitter(TextSplitter): """Splitting text using Spacy package. Per default, Spacy's `en_core_web_sm` model is used and its default max_length is 1000000 (it is the length of maximum character this model takes which can be increased for large files). For a faster, but potentially less accurate splitting, you can use `pipeline='sentencizer'`. """
[docs] def __init__( self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", max_length: int = 1_000_000, **kwargs: Any, ) -> None: """Initialize the spacy text splitter.""" super().__init__(**kwargs) self._tokenizer = _make_spacy_pipeline_for_splitting( pipeline, max_length=max_length ) self._separator = separator
[docs] def split_text(self, text: str) -> List[str]: """Split incoming text and return chunks.""" splits = (s.text for s in self._tokenizer(text).sents) return self._merge_splits(splits, self._separator)
def _make_spacy_pipeline_for_splitting( pipeline: str, *, max_length: int = 1_000_000 ) -> Any: # avoid importing spacy try: import spacy except ImportError: raise ImportError( "Spacy is not installed, please install it with `pip install spacy`." ) if pipeline == "sentencizer": from spacy.lang.en import English sentencizer: Any = English() sentencizer.add_pipe("sentencizer") else: sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"]) sentencizer.max_length = max_length return sentencizer