feat(document): 实现多格式文档上传与解析功能
- 移除 AiChatController 中的 PDF 读取相关逻辑与依赖 - 新增 DocumentController 支持文件上传接口 - 新增 DocumentIngestionService 接口及实现,负责文档处理流程 - 抽象 DocumentParser 接口统一各类文档解析器行为 - 重构所有具体文档读取器(PDF、HTML、JSON 等)实现新的解析接口 - 引入 MultipartFileResource 工具类以适配 Spring AI 读取器 - 添加 DocumentUploadResponse 响应模型类 - 各文档读取器增加对文件扩展名和 MIME 类型的支持判断
This commit is contained in:
@@ -2,20 +2,15 @@ package com.hanserwei.chat.controller;
|
|||||||
|
|
||||||
import com.hanserwei.chat.model.dto.ChatMessageDTO;
|
import com.hanserwei.chat.model.dto.ChatMessageDTO;
|
||||||
import com.hanserwei.chat.model.vo.AIResponse;
|
import com.hanserwei.chat.model.vo.AIResponse;
|
||||||
import com.hanserwei.chat.reader.MyPdfReader;
|
|
||||||
import com.hanserwei.chat.utils.ConversationContext;
|
import com.hanserwei.chat.utils.ConversationContext;
|
||||||
import jakarta.annotation.Resource;
|
import jakarta.annotation.Resource;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.ai.chat.client.ChatClient;
|
import org.springframework.ai.chat.client.ChatClient;
|
||||||
import org.springframework.ai.chat.memory.ChatMemory;
|
import org.springframework.ai.chat.memory.ChatMemory;
|
||||||
import org.springframework.ai.document.Document;
|
|
||||||
import org.springframework.ai.vectorstore.VectorStore;
|
|
||||||
import org.springframework.http.MediaType;
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.web.bind.annotation.*;
|
import org.springframework.web.bind.annotation.*;
|
||||||
import reactor.core.publisher.Flux;
|
import reactor.core.publisher.Flux;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@RestController
|
@RestController
|
||||||
@RequestMapping("/ai")
|
@RequestMapping("/ai")
|
||||||
@@ -23,10 +18,6 @@ public class AiChatController {
|
|||||||
|
|
||||||
@Resource
|
@Resource
|
||||||
private ChatClient dashScopeChatClient;
|
private ChatClient dashScopeChatClient;
|
||||||
@Resource
|
|
||||||
private MyPdfReader myPdfReader;
|
|
||||||
@Resource
|
|
||||||
private VectorStore vectorStore;
|
|
||||||
|
|
||||||
@PostMapping(path = "/chat", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
|
@PostMapping(path = "/chat", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
|
||||||
public Flux<AIResponse> chatWithAi(@RequestBody ChatMessageDTO chatMessageDTO) {
|
public Flux<AIResponse> chatWithAi(@RequestBody ChatMessageDTO chatMessageDTO) {
|
||||||
@@ -44,11 +35,4 @@ public class AiChatController {
|
|||||||
.contextWrite(ctx -> ConversationContext.withConversationId(chatMessageDTO.getConversionId()))
|
.contextWrite(ctx -> ConversationContext.withConversationId(chatMessageDTO.getConversionId()))
|
||||||
.doFinally(signalType -> ConversationContext.clear());
|
.doFinally(signalType -> ConversationContext.clear());
|
||||||
}
|
}
|
||||||
|
|
||||||
@GetMapping("/readpdf")
|
|
||||||
public String readPdf() {
|
|
||||||
List<Document> docsFromPdf = myPdfReader.getDocsFromPdf();
|
|
||||||
vectorStore.add(docsFromPdf);
|
|
||||||
return "ok!";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,29 @@
|
|||||||
|
package com.hanserwei.chat.controller;
|
||||||
|
|
||||||
|
import com.hanserwei.chat.model.vo.DocumentUploadResponse;
|
||||||
|
import com.hanserwei.chat.service.DocumentIngestionService;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.http.MediaType;
|
||||||
|
import org.springframework.http.ResponseEntity;
|
||||||
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/documents")
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocumentController {
|
||||||
|
|
||||||
|
private final DocumentIngestionService documentIngestionService;
|
||||||
|
|
||||||
|
@PostMapping(value = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
|
||||||
|
public ResponseEntity<DocumentUploadResponse> upload(@RequestParam("file") MultipartFile file) {
|
||||||
|
int documentCount = documentIngestionService.ingest(file);
|
||||||
|
log.info("文件 {} 上传成功。", file.getOriginalFilename());
|
||||||
|
return ResponseEntity.ok(new DocumentUploadResponse(file.getOriginalFilename(), documentCount, "ok"));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
package com.hanserwei.chat.model.vo;
|
||||||
|
|
||||||
|
/**
 * Response body returned after a document upload.
 *
 * @param filename      name of the uploaded file as reported by the caller; may be {@code null}
 * @param documentCount number of documents reported for the upload
 * @param message       short status text
 */
public record DocumentUploadResponse(String filename, int documentCount, String message) {
}
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
package com.hanserwei.chat.reader;
|
||||||
|
|
||||||
|
import org.springframework.ai.document.Document;
|
||||||
|
import org.springframework.lang.Nullable;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将上传文件转换为Spring AI {@link Document}实例的策略接口。
|
||||||
|
*/
|
||||||
|
public interface DocumentParser {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断此解析器是否可以处理提供的文件。
|
||||||
|
*
|
||||||
|
* @param filename 客户端提供的原始文件名(可能为{@code null})
|
||||||
|
* @param contentType 从上传中派生的MIME类型(可能为{@code null})
|
||||||
|
* @return 如果此解析器应该处理该文件则返回{@code true}
|
||||||
|
*/
|
||||||
|
boolean supports(@Nullable String filename, @Nullable String contentType);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将文件解析为Spring AI文档。
|
||||||
|
*
|
||||||
|
* @param file 多部分上传载荷
|
||||||
|
* @return 解析后的文档列表
|
||||||
|
*/
|
||||||
|
List<Document> parse(MultipartFile file);
|
||||||
|
|
||||||
|
default boolean hasExtension(@Nullable String filename, String... extensions) {
|
||||||
|
if (filename == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int index = filename.lastIndexOf('.');
|
||||||
|
if (index < 0 || index == filename.length() - 1) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
String actualExtension = filename.substring(index + 1);
|
||||||
|
for (String extension : extensions) {
|
||||||
|
if (actualExtension.equalsIgnoreCase(extension)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
default boolean matchesContentType(@Nullable String contentType, String... supportedContentTypes) {
|
||||||
|
if (contentType == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (String supportedContentType : supportedContentTypes) {
|
||||||
|
if (contentType.equalsIgnoreCase(supportedContentType)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
package com.hanserwei.chat.reader;
|
||||||
|
|
||||||
|
import org.springframework.core.io.ByteArrayResource;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.UncheckedIOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 简单的 {@link ByteArrayResource},用于保留多部分文件的文件名。
|
||||||
|
*/
|
||||||
|
final class MultipartFileResource extends ByteArrayResource {
|
||||||
|
|
||||||
|
private final String filename;
|
||||||
|
|
||||||
|
private MultipartFileResource(byte[] byteArray, String filename) {
|
||||||
|
super(byteArray);
|
||||||
|
this.filename = filename;
|
||||||
|
}
|
||||||
|
|
||||||
|
static MultipartFileResource of(MultipartFile file) {
|
||||||
|
try {
|
||||||
|
String originalFilename = Objects.requireNonNullElse(file.getOriginalFilename(), "upload");
|
||||||
|
return new MultipartFileResource(file.getBytes(), originalFilename);
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
throw new UncheckedIOException("读取多部分文件内容失败", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getFilename() {
|
||||||
|
return filename;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -3,32 +3,35 @@ package com.hanserwei.chat.reader;
|
|||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.reader.jsoup.JsoupDocumentReader;
|
import org.springframework.ai.reader.jsoup.JsoupDocumentReader;
|
||||||
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
|
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyHtmlReader {
|
public class MyHtmlReader implements DocumentParser {
|
||||||
|
|
||||||
@Value("classpath:/document/my-page.html")
|
@Override
|
||||||
private Resource resource;
|
public List<Document> parse(MultipartFile file) {
|
||||||
|
|
||||||
public List<Document> loadHtml() {
|
|
||||||
// JsoupDocumentReader 阅读器配置类
|
// JsoupDocumentReader 阅读器配置类
|
||||||
JsoupDocumentReaderConfig config = JsoupDocumentReaderConfig.builder()
|
JsoupDocumentReaderConfig config = JsoupDocumentReaderConfig.builder()
|
||||||
.selector("article p") // 提取 <article> 标签内的 p 段落
|
.selector("article p") // 提取 <article> 标签内的 p 段落
|
||||||
.charset("UTF-8") // 使用 UTF-8 编码
|
.charset("UTF-8") // 使用 UTF-8 编码
|
||||||
.includeLinkUrls(true) // 在元数据中包含链接 URL(绝对链接)
|
.includeLinkUrls(true) // 在元数据中包含链接 URL(绝对链接)
|
||||||
.metadataTags(List.of("author", "date")) // 提取 author 和 date 元标签
|
.metadataTags(List.of("author", "date")) // 提取 author 和 date 元标签
|
||||||
.additionalMetadata("source", "my-page.html") // 添加自定义元数据
|
.additionalMetadata("source", file.getOriginalFilename()) // 添加自定义元数据
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
// 新建 JsoupDocumentReader 阅读器
|
// 新建 JsoupDocumentReader 阅读器
|
||||||
JsoupDocumentReader reader = new JsoupDocumentReader(resource, config);
|
JsoupDocumentReader reader = new JsoupDocumentReader(MultipartFileResource.of(file), config);
|
||||||
|
|
||||||
// 读取并转换为 Document 文档集合
|
// 读取并转换为 Document 文档集合
|
||||||
return reader.get();
|
return reader.get();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(String filename, String contentType) {
|
||||||
|
return hasExtension(filename, "html", "htm") ||
|
||||||
|
matchesContentType(contentType, "text/html", "application/xhtml+xml");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,26 +2,28 @@ package com.hanserwei.chat.reader;
|
|||||||
|
|
||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.reader.JsonReader;
|
import org.springframework.ai.reader.JsonReader;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyJsonReader {
|
public class MyJsonReader implements DocumentParser {
|
||||||
|
|
||||||
@Value("classpath:/document/tv.json")
|
|
||||||
private Resource resource;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 读取 Json 文件
|
* 读取 Json 文件
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public List<Document> loadJson() {
|
@Override
|
||||||
|
public List<Document> parse(MultipartFile file) {
|
||||||
// 创建 JsonReader 阅读器实例,配置需要读取的字段
|
// 创建 JsonReader 阅读器实例,配置需要读取的字段
|
||||||
JsonReader jsonReader = new JsonReader(resource, "description", "content", "title");
|
JsonReader jsonReader = new JsonReader(MultipartFileResource.of(file), "description", "content", "title");
|
||||||
// 执行读取操作,并转换为 Document 对象集合
|
// 执行读取操作,并转换为 Document 对象集合
|
||||||
return jsonReader.get();
|
return jsonReader.get();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(String filename, String contentType) {
|
||||||
|
return hasExtension(filename, "json") || matchesContentType(contentType, "application/json");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,31 +3,34 @@ package com.hanserwei.chat.reader;
|
|||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
|
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
|
||||||
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
|
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyMarkdownReader {
|
public class MyMarkdownReader implements DocumentParser {
|
||||||
|
|
||||||
@Value("classpath:/document/code.md")
|
@Override
|
||||||
private Resource resource;
|
public List<Document> parse(MultipartFile file) {
|
||||||
|
|
||||||
public List<Document> loadMarkdown() {
|
|
||||||
// MarkdownDocumentReader 阅读器配置类
|
// MarkdownDocumentReader 阅读器配置类
|
||||||
MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
|
MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
|
||||||
.withHorizontalRuleCreateDocument(true) // 遇到水平线 ---,则创建新文档
|
.withHorizontalRuleCreateDocument(true) // 遇到水平线 ---,则创建新文档
|
||||||
.withIncludeCodeBlock(false) // 排除代码块(代码块生成单独文档)
|
.withIncludeCodeBlock(false) // 排除代码块(代码块生成单独文档)
|
||||||
.withIncludeBlockquote(false) // 排除块引用(块引用生成单独文档)
|
.withIncludeBlockquote(false) // 排除块引用(块引用生成单独文档)
|
||||||
.withAdditionalMetadata("filename", "code.md") // 添加自定义元数据,如文件名称
|
.withAdditionalMetadata("filename", file.getOriginalFilename()) // 添加自定义元数据,如文件名称
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
// 新建 MarkdownDocumentReader 阅读器
|
// 新建 MarkdownDocumentReader 阅读器
|
||||||
MarkdownDocumentReader reader = new MarkdownDocumentReader(resource, config);
|
MarkdownDocumentReader reader = new MarkdownDocumentReader(MultipartFileResource.of(file), config);
|
||||||
|
|
||||||
// 读取并转换为 Document 文档集合
|
// 读取并转换为 Document 文档集合
|
||||||
return reader.get();
|
return reader.get();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(String filename, String contentType) {
|
||||||
|
return hasExtension(filename, "md", "markdown") ||
|
||||||
|
matchesContentType(contentType, "text/markdown", "text/x-markdown");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,24 +5,30 @@ import org.springframework.ai.reader.ExtractedTextFormatter;
|
|||||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyPdfReader {
|
public class MyPdfReader implements DocumentParser {
|
||||||
|
|
||||||
public List<Document> getDocsFromPdf() {
|
@Override
|
||||||
// 新建 PagePdfDocumentReader 阅读器
|
public List<Document> parse(MultipartFile file) {
|
||||||
PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/document/profile.pdf", // PDF 文件路径
|
PagePdfDocumentReader pdfReader = new PagePdfDocumentReader(
|
||||||
|
MultipartFileResource.of(file),
|
||||||
PdfDocumentReaderConfig.builder()
|
PdfDocumentReaderConfig.builder()
|
||||||
.withPageTopMargin(0) // 设置页面顶边距为0
|
.withPageTopMargin(0)
|
||||||
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder()
|
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder()
|
||||||
.withNumberOfTopTextLinesToDelete(0) // 设置删除顶部文本行数为0
|
.withNumberOfTopTextLinesToDelete(0)
|
||||||
.build())
|
.build())
|
||||||
.withPagesPerDocument(1) // 设置每个文档包含1页
|
.withPagesPerDocument(1)
|
||||||
.build());
|
.build());
|
||||||
|
|
||||||
// 读取并转换为 Document 文档集合
|
|
||||||
return pdfReader.read();
|
return pdfReader.read();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(String filename, String contentType) {
|
||||||
|
return hasExtension(filename, "pdf") || matchesContentType(contentType, "application/pdf");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,47 +2,27 @@ package com.hanserwei.chat.reader;
|
|||||||
|
|
||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.reader.TextReader;
|
import org.springframework.ai.reader.TextReader;
|
||||||
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyTextReader {
|
public class MyTextReader implements DocumentParser {
|
||||||
|
|
||||||
@Value("classpath:/document/manual.txt")
|
|
||||||
private Resource resource;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 读取 Txt 文档
|
* 读取 Txt 文档
|
||||||
* @return 读取的文档集合
|
* @return 读取的文档集合
|
||||||
*/
|
*/
|
||||||
public List<Document> loadText() {
|
@Override
|
||||||
// 创建 TextReader 对象,用于读取指定资源 (resource) 的文本内容
|
public List<Document> parse(MultipartFile file) {
|
||||||
TextReader textReader = new TextReader(resource);
|
TextReader textReader = new TextReader(MultipartFileResource.of(file));
|
||||||
// 添加自定义元数据,如文件名称
|
textReader.getCustomMetadata().put("filename", file.getOriginalFilename());
|
||||||
textReader.getCustomMetadata()
|
|
||||||
.put("filename", "manual.txt");
|
|
||||||
// 读取并转换为 Document 文档集合
|
|
||||||
return textReader.read();
|
return textReader.read();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
@Override
|
||||||
* 读取 Txt 文档并分块拆分
|
public boolean supports(String filename, String contentType) {
|
||||||
* @return 文档分块集合
|
return hasExtension(filename, "txt") || matchesContentType(contentType, "text/plain");
|
||||||
*/
|
|
||||||
public List<Document> loadTextAndSplit() {
|
|
||||||
// 创建 TextReader 对象,用于读取指定资源 (resource) 的文本内容
|
|
||||||
TextReader textReader = new TextReader(resource);
|
|
||||||
|
|
||||||
// 将资源内容解析为 Document 对象集合
|
|
||||||
List<Document> documents = textReader.get();
|
|
||||||
|
|
||||||
// 使用 TokenTextSplitter 对文档列表进行分块处理
|
|
||||||
|
|
||||||
// 返回拆分后的文档分块集合
|
|
||||||
return new TokenTextSplitter().apply(documents);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,21 +3,18 @@ package com.hanserwei.chat.reader;
|
|||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
||||||
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyTikaPptReader {
|
public class MyTikaPptReader implements DocumentParser {
|
||||||
|
|
||||||
@Value("classpath:/document/XX牌云感变频空调说明书.pptx")
|
@Override
|
||||||
private Resource resource;
|
public List<Document> parse(MultipartFile file) {
|
||||||
|
|
||||||
public List<Document> loadPpt() {
|
|
||||||
// 新建 TikaDocumentReader 阅读器
|
// 新建 TikaDocumentReader 阅读器
|
||||||
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(resource);
|
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(MultipartFileResource.of(file));
|
||||||
// 读取并转换为 Document 文档集合
|
// 读取并转换为 Document 文档集合
|
||||||
List<Document> documents = tikaDocumentReader.get();
|
List<Document> documents = tikaDocumentReader.get();
|
||||||
|
|
||||||
@@ -26,4 +23,12 @@ public class MyTikaPptReader {
|
|||||||
TokenTextSplitter splitter = new TokenTextSplitter(1000, 400, 10, 5000, true);
|
TokenTextSplitter splitter = new TokenTextSplitter(1000, 400, 10, 5000, true);
|
||||||
return splitter.apply(documents);
|
return splitter.apply(documents);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(String filename, String contentType) {
|
||||||
|
return hasExtension(filename, "ppt", "pptx") ||
|
||||||
|
matchesContentType(contentType,
|
||||||
|
"application/vnd.ms-powerpoint",
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,21 +3,18 @@ package com.hanserwei.chat.reader;
|
|||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
||||||
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Component
|
@Component
|
||||||
public class MyTikaWordReader {
|
public class MyTikaWordReader implements DocumentParser {
|
||||||
|
|
||||||
@Value("classpath:/document/55f79946a0964b89bc7ab9b55e4a49ff.docx")
|
@Override
|
||||||
private Resource resource;
|
public List<Document> parse(MultipartFile file) {
|
||||||
|
|
||||||
public List<Document> loadWord() {
|
|
||||||
// 新建 TikaDocumentReader 阅读器
|
// 新建 TikaDocumentReader 阅读器
|
||||||
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(resource);
|
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(MultipartFileResource.of(file));
|
||||||
// 读取并转换为 Document 文档集合
|
// 读取并转换为 Document 文档集合
|
||||||
List<Document> documents = tikaDocumentReader.get();
|
List<Document> documents = tikaDocumentReader.get();
|
||||||
|
|
||||||
@@ -25,4 +22,12 @@ public class MyTikaWordReader {
|
|||||||
TokenTextSplitter splitter = new TokenTextSplitter(); // 不设置任何构造参数,表示使用默认设置
|
TokenTextSplitter splitter = new TokenTextSplitter(); // 不设置任何构造参数,表示使用默认设置
|
||||||
return splitter.apply(documents);
|
return splitter.apply(documents);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Override
|
||||||
|
public boolean supports(String filename, String contentType) {
|
||||||
|
return hasExtension(filename, "doc", "docx") ||
|
||||||
|
matchesContentType(contentType,
|
||||||
|
"application/msword",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,8 @@
|
|||||||
|
package com.hanserwei.chat.service;
|
||||||
|
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
public interface DocumentIngestionService {
|
||||||
|
|
||||||
|
int ingest(MultipartFile file);
|
||||||
|
}
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
package com.hanserwei.chat.service.impl;
|
||||||
|
|
||||||
|
import com.hanserwei.chat.reader.DocumentParser;
|
||||||
|
import com.hanserwei.chat.service.DocumentIngestionService;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.ai.document.Document;
|
||||||
|
import org.springframework.ai.vectorstore.VectorStore;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocumentIngestionServiceImpl implements DocumentIngestionService {
|
||||||
|
|
||||||
|
private final List<DocumentParser> documentParsers;
|
||||||
|
private final VectorStore vectorStore;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int ingest(MultipartFile file) {
|
||||||
|
if (file == null || file.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("上传文件不能为空");
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentParser parser = documentParsers.stream()
|
||||||
|
.filter(candidate -> candidate.supports(file.getOriginalFilename(), file.getContentType()))
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("不支持的文件类型:" + file.getOriginalFilename()));
|
||||||
|
|
||||||
|
List<Document> documents = parser.parse(file);
|
||||||
|
if (documents.isEmpty()) {
|
||||||
|
log.warn("文件 {} 解析后未生成任何文档,跳过入库。", file.getOriginalFilename());
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
vectorStore.add(documents);
|
||||||
|
log.info("文件 {} 入库成功,共写入 {} 条向量。", file.getOriginalFilename(), documents.size());
|
||||||
|
return documents.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user