From a9fce282ed7a858dffb8ea6022c242ef0581267a Mon Sep 17 00:00:00 2001 From: Hanserwei <2628273921@qq.com> Date: Fri, 31 Oct 2025 21:31:44 +0800 Subject: [PATCH] =?UTF-8?q?feat(document):=20=E5=AE=9E=E7=8E=B0=E5=A4=9A?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E6=96=87=E6=A1=A3=E4=B8=8A=E4=BC=A0=E4=B8=8E?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 移除 AiChatController 中的 PDF 读取相关逻辑与依赖- 新增 DocumentController 支持文件上传接口 - 新增 DocumentIngestionService 接口及实现,负责文档处理流程 - 抽象 DocumentParser 接口统一各类文档解析器行为 - 重构所有具体文档读取器(PDF、HTML、JSON 等)实现新的解析接口- 引入 MultipartFileResource 工具类以适配 Spring AI 读取器 - 添加 DocumentUploadResponse 响应模型类 - 各文档读取器增加对文件扩展名和 MIME 类型的支持判断 --- .../chat/controller/AiChatController.java | 16 ----- .../chat/controller/DocumentController.java | 29 +++++++++ .../chat/model/vo/DocumentUploadResponse.java | 4 ++ .../hanserwei/chat/reader/DocumentParser.java | 59 +++++++++++++++++++ .../chat/reader/MultipartFileResource.java | 36 +++++++++++ .../hanserwei/chat/reader/MyHtmlReader.java | 23 ++++---- .../hanserwei/chat/reader/MyJsonReader.java | 20 ++++--- .../chat/reader/MyMarkdownReader.java | 23 ++++---- .../hanserwei/chat/reader/MyPdfReader.java | 24 +++++--- .../hanserwei/chat/reader/MyTextReader.java | 40 ++++--------- .../chat/reader/MyTikaPptReader.java | 23 +++++--- .../chat/reader/MyTikaWordReader.java | 23 +++++--- .../service/DocumentIngestionService.java | 8 +++ .../impl/DocumentIngestionServiceImpl.java | 43 ++++++++++++++ 14 files changed, 269 insertions(+), 102 deletions(-) create mode 100644 snails-chat/src/main/java/com/hanserwei/chat/controller/DocumentController.java create mode 100644 snails-chat/src/main/java/com/hanserwei/chat/model/vo/DocumentUploadResponse.java create mode 100644 snails-chat/src/main/java/com/hanserwei/chat/reader/DocumentParser.java create mode 100644 snails-chat/src/main/java/com/hanserwei/chat/reader/MultipartFileResource.java create mode 100644 snails-chat/src/main/java/com/hanserwei/chat/service/DocumentIngestionService.java create mode 100644 snails-chat/src/main/java/com/hanserwei/chat/service/impl/DocumentIngestionServiceImpl.java diff --git a/snails-chat/src/main/java/com/hanserwei/chat/controller/AiChatController.java b/snails-chat/src/main/java/com/hanserwei/chat/controller/AiChatController.java index 30c6bae..e53ed3a 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/controller/AiChatController.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/controller/AiChatController.java @@ -2,20 +2,15 @@ package com.hanserwei.chat.controller; import com.hanserwei.chat.model.dto.ChatMessageDTO; import com.hanserwei.chat.model.vo.AIResponse; -import com.hanserwei.chat.reader.MyPdfReader; import com.hanserwei.chat.utils.ConversationContext; import jakarta.annotation.Resource; import lombok.extern.slf4j.Slf4j; import org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.chat.memory.ChatMemory; -import org.springframework.ai.document.Document; -import org.springframework.ai.vectorstore.VectorStore; import org.springframework.http.MediaType; import org.springframework.web.bind.annotation.*; import reactor.core.publisher.Flux; -import java.util.List; - @Slf4j @RestController @RequestMapping("/ai") @@ -23,10 +18,6 @@ public class AiChatController { @Resource private ChatClient dashScopeChatClient; - @Resource - private MyPdfReader myPdfReader; - @Resource - private VectorStore vectorStore; @PostMapping(path = "/chat", produces = MediaType.TEXT_EVENT_STREAM_VALUE) public Flux chatWithAi(@RequestBody ChatMessageDTO chatMessageDTO) { @@ -44,11 +35,4 @@ public class AiChatController { .contextWrite(ctx -> ConversationContext.withConversationId(chatMessageDTO.getConversionId())) .doFinally(signalType -> ConversationContext.clear()); } - - @GetMapping("/readpdf") - public String readPdf() { - List docsFromPdf = myPdfReader.getDocsFromPdf(); - vectorStore.add(docsFromPdf); - return "ok!"; - } } diff --git a/snails-chat/src/main/java/com/hanserwei/chat/controller/DocumentController.java b/snails-chat/src/main/java/com/hanserwei/chat/controller/DocumentController.java new file mode 100644 index 0000000..5505bdc --- /dev/null +++ b/snails-chat/src/main/java/com/hanserwei/chat/controller/DocumentController.java @@ -0,0 +1,29 @@ +package com.hanserwei.chat.controller; + +import com.hanserwei.chat.model.vo.DocumentUploadResponse; +import com.hanserwei.chat.service.DocumentIngestionService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + +@Slf4j +@RestController +@RequestMapping("/documents") +@RequiredArgsConstructor +public class DocumentController { + + private final DocumentIngestionService documentIngestionService; + + @PostMapping(value = "/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public ResponseEntity upload(@RequestParam("file") MultipartFile file) { + int documentCount = documentIngestionService.ingest(file); + log.info("文件 {} 上传成功。", file.getOriginalFilename()); + return ResponseEntity.ok(new DocumentUploadResponse(file.getOriginalFilename(), documentCount, "ok")); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/model/vo/DocumentUploadResponse.java b/snails-chat/src/main/java/com/hanserwei/chat/model/vo/DocumentUploadResponse.java new file mode 100644 index 0000000..aafc3e9 --- /dev/null +++ b/snails-chat/src/main/java/com/hanserwei/chat/model/vo/DocumentUploadResponse.java @@ -0,0 +1,4 @@ +package com.hanserwei.chat.model.vo; + +public record DocumentUploadResponse(String filename, int documentCount, String message) { +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/DocumentParser.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/DocumentParser.java new file mode 100644 index 0000000..9378646 --- /dev/null +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/DocumentParser.java @@ -0,0 +1,59 @@ +package com.hanserwei.chat.reader; + +import org.springframework.ai.document.Document; +import org.springframework.lang.Nullable; +import org.springframework.web.multipart.MultipartFile; + +import java.util.List; + +/** + * 将上传文件转换为Spring AI {@link Document}实例的策略接口。 + */ +public interface DocumentParser { + + /** + * 判断此解析器是否可以处理提供的文件。 + * + * @param filename 客户端提供的原始文件名(可能为{@code null}) + * @param contentType 从上传中派生的MIME类型(可能为{@code null}) + * @return 如果此解析器应该处理该文件则返回{@code true} + */ + boolean supports(@Nullable String filename, @Nullable String contentType); + + /** + * 将文件解析为Spring AI文档。 + * + * @param file 多部分上传载荷 + * @return 解析后的文档列表 + */ + List parse(MultipartFile file); + + default boolean hasExtension(@Nullable String filename, String... extensions) { + if (filename == null) { + return false; + } + int index = filename.lastIndexOf('.'); + if (index < 0 || index == filename.length() - 1) { + return false; + } + String actualExtension = filename.substring(index + 1); + for (String extension : extensions) { + if (actualExtension.equalsIgnoreCase(extension)) { + return true; + } + } + return false; + } + + default boolean matchesContentType(@Nullable String contentType, String... supportedContentTypes) { + if (contentType == null) { + return false; + } + for (String supportedContentType : supportedContentTypes) { + if (contentType.equalsIgnoreCase(supportedContentType)) { + return true; + } + } + return false; + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MultipartFileResource.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MultipartFileResource.java new file mode 100644 index 0000000..94ac779 --- /dev/null +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MultipartFileResource.java @@ -0,0 +1,36 @@ +package com.hanserwei.chat.reader; + +import org.springframework.core.io.ByteArrayResource; +import org.springframework.web.multipart.MultipartFile; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Objects; + +/** + * 简单的 {@link ByteArrayResource},用于保留多部分文件的文件名。 + */ +final class MultipartFileResource extends ByteArrayResource { + + private final String filename; + + private MultipartFileResource(byte[] byteArray, String filename) { + super(byteArray); + this.filename = filename; + } + + static MultipartFileResource of(MultipartFile file) { + try { + String originalFilename = Objects.requireNonNullElse(file.getOriginalFilename(), "upload"); + return new MultipartFileResource(file.getBytes(), originalFilename); + } + catch (IOException ex) { + throw new UncheckedIOException("读取多部分文件内容失败", ex); + } + } + + @Override + public String getFilename() { + return filename; + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyHtmlReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyHtmlReader.java index 204f668..50aaef5 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyHtmlReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyHtmlReader.java @@ -3,32 +3,35 @@ package com.hanserwei.chat.reader; import org.springframework.ai.document.Document; import org.springframework.ai.reader.jsoup.JsoupDocumentReader; import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyHtmlReader { +public class MyHtmlReader implements DocumentParser { - @Value("classpath:/document/my-page.html") - private Resource resource; - - public List loadHtml() { + @Override + public List parse(MultipartFile file) { // JsoupDocumentReader 阅读器配置类 JsoupDocumentReaderConfig config = JsoupDocumentReaderConfig.builder() .selector("article p") // 提取
标签内的 p 段落 .charset("UTF-8") // 使用 UTF-8 编码 .includeLinkUrls(true) // 在元数据中包含链接 URL(绝对链接) .metadataTags(List.of("author", "date")) // 提取 author 和 date 元标签 - .additionalMetadata("source", "my-page.html") // 添加自定义元数据 + .additionalMetadata("source", file.getOriginalFilename()) // 添加自定义元数据 .build(); // 新建 JsoupDocumentReader 阅读器 - JsoupDocumentReader reader = new JsoupDocumentReader(resource, config); + JsoupDocumentReader reader = new JsoupDocumentReader(MultipartFileResource.of(file), config); // 读取并转换为 Document 文档集合 return reader.get(); } -} \ No newline at end of file + + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "html", "htm") || + matchesContentType(contentType, "text/html", "application/xhtml+xml"); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyJsonReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyJsonReader.java index 9004327..c961855 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyJsonReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyJsonReader.java @@ -2,26 +2,28 @@ package com.hanserwei.chat.reader; import org.springframework.ai.document.Document; import org.springframework.ai.reader.JsonReader; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyJsonReader { - - @Value("classpath:/document/tv.json") - private Resource resource; +public class MyJsonReader implements DocumentParser { /** * 读取 Json 文件 * @return */ - public List loadJson() { + @Override + public List parse(MultipartFile file) { // 创建 JsonReader 阅读器实例,配置需要读取的字段 - JsonReader jsonReader = new JsonReader(resource, "description", "content", "title"); + JsonReader jsonReader = new JsonReader(MultipartFileResource.of(file), "description", "content", "title"); // 执行读取操作,并转换为 Document 对象集合 return jsonReader.get(); } -} \ No newline at end of file + + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "json") || matchesContentType(contentType, "application/json"); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyMarkdownReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyMarkdownReader.java index 01f51d9..9c4a702 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyMarkdownReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyMarkdownReader.java @@ -3,31 +3,34 @@ package com.hanserwei.chat.reader; import org.springframework.ai.document.Document; import org.springframework.ai.reader.markdown.MarkdownDocumentReader; import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyMarkdownReader { +public class MyMarkdownReader implements DocumentParser { - @Value("classpath:/document/code.md") - private Resource resource; - - public List loadMarkdown() { + @Override + public List parse(MultipartFile file) { // MarkdownDocumentReader 阅读器配置类 MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() .withHorizontalRuleCreateDocument(true) // 遇到水平线 ---,则创建新文档 .withIncludeCodeBlock(false) // 排除代码块(代码块生成单独文档) .withIncludeBlockquote(false) // 排除块引用(块引用生成单独文档) - .withAdditionalMetadata("filename", "code.md") // 添加自定义元数据,如文件名称 + .withAdditionalMetadata("filename", file.getOriginalFilename()) // 添加自定义元数据,如文件名称 .build(); // 新建 MarkdownDocumentReader 阅读器 - MarkdownDocumentReader reader = new MarkdownDocumentReader(resource, config); + MarkdownDocumentReader reader = new MarkdownDocumentReader(MultipartFileResource.of(file), config); // 读取并转换为 Document 文档集合 return reader.get(); } -} \ No newline at end of file + + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "md", "markdown") || + matchesContentType(contentType, "text/markdown", "text/x-markdown"); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyPdfReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyPdfReader.java index d434598..dbfa934 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyPdfReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyPdfReader.java @@ -5,24 +5,30 @@ import org.springframework.ai.reader.ExtractedTextFormatter; import org.springframework.ai.reader.pdf.PagePdfDocumentReader; import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyPdfReader { +public class MyPdfReader implements DocumentParser { - public List getDocsFromPdf() { - // 新建 PagePdfDocumentReader 阅读器 - PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/document/profile.pdf", // PDF 文件路径 + @Override + public List parse(MultipartFile file) { + PagePdfDocumentReader pdfReader = new PagePdfDocumentReader( + MultipartFileResource.of(file), PdfDocumentReaderConfig.builder() - .withPageTopMargin(0) // 设置页面顶边距为0 + .withPageTopMargin(0) .withPageExtractedTextFormatter(ExtractedTextFormatter.builder() - .withNumberOfTopTextLinesToDelete(0) // 设置删除顶部文本行数为0 + .withNumberOfTopTextLinesToDelete(0) .build()) - .withPagesPerDocument(1) // 设置每个文档包含1页 + .withPagesPerDocument(1) .build()); - // 读取并转换为 Document 文档集合 return pdfReader.read(); } -} \ No newline at end of file + + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "pdf") || matchesContentType(contentType, "application/pdf"); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTextReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTextReader.java index 0198d63..60dd967 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTextReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTextReader.java @@ -2,47 +2,27 @@ package com.hanserwei.chat.reader; import org.springframework.ai.document.Document; import org.springframework.ai.reader.TextReader; -import org.springframework.ai.transformer.splitter.TokenTextSplitter; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyTextReader { - - @Value("classpath:/document/manual.txt") - private Resource resource; +public class MyTextReader implements DocumentParser { /** * 读取 Txt 文档 * @return 读取的文档集合 */ - public List loadText() { - // 创建 TextReader 对象,用于读取指定资源 (resource) 的文本内容 - TextReader textReader = new TextReader(resource); - // 添加自定义元数据,如文件名称 - textReader.getCustomMetadata() - .put("filename", "manual.txt"); - // 读取并转换为 Document 文档集合 + @Override + public List parse(MultipartFile file) { + TextReader textReader = new TextReader(MultipartFileResource.of(file)); + textReader.getCustomMetadata().put("filename", file.getOriginalFilename()); return textReader.read(); } - /** - * 读取 Txt 文档并分块拆分 - * @return 文档分块集合 - */ - public List loadTextAndSplit() { - // 创建 TextReader 对象,用于读取指定资源 (resource) 的文本内容 - TextReader textReader = new TextReader(resource); - - // 将资源内容解析为 Document 对象集合 - List documents = textReader.get(); - - // 使用 TokenTextSplitter 对文档列表进行分块处理 - - // 返回拆分后的文档分块集合 - return new TokenTextSplitter().apply(documents); + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "txt") || matchesContentType(contentType, "text/plain"); } -} \ No newline at end of file +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaPptReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaPptReader.java index 397ec43..06efd09 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaPptReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaPptReader.java @@ -3,21 +3,18 @@ package com.hanserwei.chat.reader; import org.springframework.ai.document.Document; import org.springframework.ai.reader.tika.TikaDocumentReader; import org.springframework.ai.transformer.splitter.TokenTextSplitter; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyTikaPptReader { +public class MyTikaPptReader implements DocumentParser { - @Value("classpath:/document/XX牌云感变频空调说明书.pptx") - private Resource resource; - - public List loadPpt() { + @Override + public List parse(MultipartFile file) { // 新建 TikaDocumentReader 阅读器 - TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(resource); + TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(MultipartFileResource.of(file)); // 读取并转换为 Document 文档集合 List documents = tikaDocumentReader.get(); @@ -26,4 +23,12 @@ public class MyTikaPptReader { TokenTextSplitter splitter = new TokenTextSplitter(1000, 400, 10, 5000, true); return splitter.apply(documents); } -} \ No newline at end of file + + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "ppt", "pptx") || + matchesContentType(contentType, + "application/vnd.ms-powerpoint", + "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaWordReader.java b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaWordReader.java index e1160c1..188fc7d 100644 --- a/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaWordReader.java +++ b/snails-chat/src/main/java/com/hanserwei/chat/reader/MyTikaWordReader.java @@ -3,21 +3,18 @@ package com.hanserwei.chat.reader; import org.springframework.ai.document.Document; import org.springframework.ai.reader.tika.TikaDocumentReader; import org.springframework.ai.transformer.splitter.TokenTextSplitter; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; +import org.springframework.web.multipart.MultipartFile; import java.util.List; @Component -public class MyTikaWordReader { +public class MyTikaWordReader implements DocumentParser { - @Value("classpath:/document/55f79946a0964b89bc7ab9b55e4a49ff.docx") - private Resource resource; - - public List loadWord() { + @Override + public List parse(MultipartFile file) { // 新建 TikaDocumentReader 阅读器 - TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(resource); + TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(MultipartFileResource.of(file)); // 读取并转换为 Document 文档集合 List documents = tikaDocumentReader.get(); @@ -25,4 +22,12 @@ public class MyTikaWordReader { TokenTextSplitter splitter = new TokenTextSplitter(); // 不设置任何构造参数,表示使用默认设置 return splitter.apply(documents); } -} \ No newline at end of file + + @Override + public boolean supports(String filename, String contentType) { + return hasExtension(filename, "doc", "docx") || + matchesContentType(contentType, + "application/msword", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + } +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/service/DocumentIngestionService.java b/snails-chat/src/main/java/com/hanserwei/chat/service/DocumentIngestionService.java new file mode 100644 index 0000000..b90192b --- /dev/null +++ b/snails-chat/src/main/java/com/hanserwei/chat/service/DocumentIngestionService.java @@ -0,0 +1,8 @@ +package com.hanserwei.chat.service; + +import org.springframework.web.multipart.MultipartFile; + +public interface DocumentIngestionService { + + int ingest(MultipartFile file); +} diff --git a/snails-chat/src/main/java/com/hanserwei/chat/service/impl/DocumentIngestionServiceImpl.java b/snails-chat/src/main/java/com/hanserwei/chat/service/impl/DocumentIngestionServiceImpl.java new file mode 100644 index 0000000..ee57329 --- /dev/null +++ b/snails-chat/src/main/java/com/hanserwei/chat/service/impl/DocumentIngestionServiceImpl.java @@ -0,0 +1,43 @@ +package com.hanserwei.chat.service.impl; + +import com.hanserwei.chat.reader.DocumentParser; +import com.hanserwei.chat.service.DocumentIngestionService; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.ai.document.Document; +import org.springframework.ai.vectorstore.VectorStore; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import java.util.List; + +@Slf4j +@Service +@RequiredArgsConstructor +public class DocumentIngestionServiceImpl implements DocumentIngestionService { + + private final List documentParsers; + private final VectorStore vectorStore; + + @Override + public int ingest(MultipartFile file) { + if (file == null || file.isEmpty()) { + throw new IllegalArgumentException("上传文件不能为空"); + } + + DocumentParser parser = documentParsers.stream() + .filter(candidate -> candidate.supports(file.getOriginalFilename(), file.getContentType())) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("不支持的文件类型:" + file.getOriginalFilename())); + + List documents = parser.parse(file); + if (documents.isEmpty()) { + log.warn("文件 {} 解析后未生成任何文档,跳过入库。", file.getOriginalFilename()); + return 0; + } + + vectorStore.add(documents); + log.info("文件 {} 入库成功,共写入 {} 条向量。", file.getOriginalFilename(), documents.size()); + return documents.size(); + } +}