LangChain-AI应用开发框架(十)

张开发
2026/5/18 4:33:45 15 分钟阅读
LangChain-AI应用开发框架(十)
## 一. 文本分割器 Text splitters

## 二. 概念

## 三. 根据文档长度与文档语义拆分

### 1. 基于字符长度拆分

```python
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter

# single模式,只生成单个
loader = UnstructuredMarkdownLoader("../Docs/markdown/脚手架级微服务租房平台QA.md")
data = loader.load()

# 文本分类器
text_splitter = CharacterTextSplitter(
    separator="\n\n",          # 分割符,可以设置分割符列表
    chunk_size=400,            # 块大小
    chunk_overlap=50,          # 块重叠大小
    length_function=len,       # 测量字符长度的函数
    is_separator_regex=False,  # 是否正则表达式描写分割符
)

# 分割文档
documents = text_splitter.split_documents(data)
for document in documents[:10]:
    print("*" * 30)
    print()
    print(document)
```

链接地址:https://reference.langchain.com/python/langchain-text-splitters/character/CharacterTextSplitter

### 2. 基于Token长度拆分

```python
import tiktoken

# 定义cl100k_base编码方式的分词器
enc = tiktoken.get_encoding("cl100k_base")
# 进行切分编码
enc_output = enc.encode("my name is LiHua!")
# 打印结果
print(f"编码后的token: {str(enc_output)}")
for token in enc_output:
    print(f"将token: {str(token)} 变成文本: {str(enc.decode_single_token_bytes(token))}")
```

```python
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter

# single模式,只生成单个
loader = UnstructuredMarkdownLoader("../Docs/markdown/脚手架级微服务租房平台QA.md")
data = loader.load()

# tiktoken分词器
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",  # tiktoken分词器中的一种编码方式
    chunk_size=400,               # token大小
    chunk_overlap=50,             # 块重叠大小
)

# 分割文档
documents = text_splitter.split_documents(data)
for document in documents[:10]:
    print("*" * 30)
    print()
    print(document)
```

### 3. 硬性约束长度拆分

```python
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# single模式,只生成单个
loader = UnstructuredMarkdownLoader("../Docs/markdown/脚手架级微服务租房平台QA.md")
data = loader.load()

# 强制按照约定的块大小进行分割
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base",  # tiktoken分词器中的一种编码方式
#     chunk_size=100,               # token大小
#     chunk_overlap=0,              # 块重叠大小
# )
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],  # 分割符,可以设置分割符列表
    chunk_size=400,                  # 块大小
    chunk_overlap=50,                # 块重叠大小
    length_function=len,             # 测量字符长度的函数
    is_separator_regex=False,        # 是否正则表达式描写分割符
)

# 分割文档
documents = text_splitter.split_documents(data)
for document in documents[:10]:
    print("*" * 30)
    print()
    print(document)

# import tiktoken
# # 定义cl100k_base编码方式的分词器
# enc = tiktoken.get_encoding("cl100k_base")
# # 进行切分编码
# enc_output = enc.encode("my name is LiHua!")
# # 打印结果
# print(f"编码后的token: {str(enc_output)}")
# for token in enc_output:
#     print(f"将token: {str(token)} 变成文本: {str(enc.decode_single_token_bytes(token))}")
```

## 四. 特殊文档结构拆分

```python
from langchain_text_splitters import PythonCodeTextSplitter

# 字符串文档
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

def hello_python():
    print("Hello, Python!")
"""

python_splitter = PythonCodeTextSplitter(chunk_size=50, chunk_overlap=0)
python_docs = python_splitter.create_documents([PYTHON_CODE])
for document in python_docs[:2]:
    print("*" * 30)
    print(f"{document}\n")
```

更多文章