Java操作Word文档全攻略：读取doc与docx文件

一、技术选型对比

主流Java操作Word方案比较

graph TD
    A[Apache POI] --> B[doc/docx]
    C[docx4j] --> D[docx only]
    E[jacob] --> F[依赖MS Office]
    G[Aspose.Words] --> H[商业收费]

方案	支持格式	开源	功能完整性	学习曲线	适用场景
Apache POI	doc/docx	是	★★★★	中等	企业级应用
docx4j	docx	是	★★★	较陡	复杂文档处理
jacob	doc/docx	是	★★	高	Windows环境
Aspose	doc/docx	否	★★★★★	低	商业项目

二、Apache POI 核心操作

1. 环境配置

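Maven依赖（以下仅为 Apache POI；后文的 HTML 转换示例还需要 xdocreport 的 fr.opensagres.poi.xwpf.converter.xhtml，文档比较示例需要 commons-lang3，docx4j 示例需要 docx4j，需另行添加）：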

<!-- 核心库 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.3</version>
</dependency>
<!-- docx支持 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.3</version>
</dependency>
<!-- doc支持 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>5.2.3</version>
</dependency>

2. docx文件读取（XWPF）

public class DocxReader {
    public static void readDocx(String filePath) throws IOException {
        try (XWPFDocument doc = new XWPFDocument(new FileInputStream(filePath))) {
            
            // 读取段落
            System.out.println("==== 段落 ====");
            doc.getParagraphs().forEach(p -> 
                System.out.println(p.getText()));
            
            // 读取表格
            System.out.println("==== 表格 ====");
            doc.getTables().forEach(table -> {
                table.getRows().forEach(row -> {
                    row.getTableCells().forEach(cell -> 
                        System.out.print(cell.getText() + "\t");
                    );
                    System.out.println();
                });
            });
            
            // 读取页眉页脚
            System.out.println("==== 页眉 ====");
            doc.getHeaderList().forEach(header -> 
                header.getParagraphs().forEach(p -> 
                    System.out.println(p.getText())));
            
            // 读取图片
            System.out.println("==== 图片数量 ====");
            System.out.println(doc.getAllPictures().size());
        }
    }
}

3. doc文件读取（HWPF）

/**
 * Dumps the content of a legacy .doc file to standard output using the HWPF API.
 */
public class DocReader {
    /**
     * Prints the full document text, then every paragraph, then every table
     * (one line per row, trimmed cell texts separated by tabs).
     *
     * @param filePath path of the .doc file to read
     * @throws IOException if the file cannot be opened or read
     */
    public static void readDoc(String filePath) throws IOException {
        try (HWPFDocument doc = new HWPFDocument(new FileInputStream(filePath))) {

            // Whole text in one go
            System.out.println("==== 全文内容 ====");
            System.out.println(doc.getDocumentText());

            // Paragraph by paragraph
            System.out.println("==== 段落 ====");
            Range body = doc.getRange();
            int paragraphIndex = 0;
            while (paragraphIndex < body.numParagraphs()) {
                Paragraph current = body.getParagraph(paragraphIndex);
                System.out.println(current.text());
                paragraphIndex++;
            }

            // Tables found in the same range
            System.out.println("==== 表格 ====");
            for (TableIterator tables = new TableIterator(body); tables.hasNext(); ) {
                Table table = tables.next();
                int rowIndex = 0;
                while (rowIndex < table.numRows()) {
                    TableRow row = table.getRow(rowIndex);
                    int cellIndex = 0;
                    while (cellIndex < row.numCells()) {
                        TableCell cell = row.getCell(cellIndex);
                        System.out.print(cell.text().trim() + "\t");
                        cellIndex++;
                    }
                    System.out.println();
                    rowIndex++;
                }
            }
        }
    }
}

三、高级处理技巧

1. 样式信息提取

/**
 * Prints, for every body paragraph, its text, alignment and left indentation,
 * followed by the font family, font size, bold and italic flags of each run.
 *
 * @param doc the document whose paragraphs are inspected
 */
public void extractStyles(XWPFDocument doc) {
    for (XWPFParagraph paragraph : doc.getParagraphs()) {
        // Paragraph-level properties
        System.out.println("段落文本: " + paragraph.getText());
        System.out.println("对齐方式: " + paragraph.getAlignment());
        System.out.println("缩进: " + paragraph.getIndentationLeft());

        // Run-level properties; the run itself is printed through its toString()
        paragraph.getRuns().forEach(textRun -> {
            System.out.println("-- 文本块: " + textRun);
            System.out.println("  字体: " + textRun.getFontFamily());
            System.out.println("  大小: " + textRun.getFontSize());
            System.out.println("  加粗: " + textRun.isBold());
            System.out.println("  斜体: " + textRun.isItalic());
        });
    }
}

2. 复杂表格处理

/**
 * Reports merged cells and the table style of a table.
 *
 * <p>For every cell that carries cell properties, prints the column span when a
 * grid span is set and the vertical-merge value when a vertical merge is set.
 * Finally prints the table style value when the table declares one.
 *
 * @param table the table to inspect
 */
public void processComplexTable(XWPFTable table) {
    // Walk every cell of every row
    for (XWPFTableRow row : table.getRows()) {
        for (XWPFTableCell cell : row.getTableCells()) {
            // Cell properties are optional in the underlying XML
            CTTcPr tcPr = cell.getCTTc().getTcPr();
            if (tcPr == null) {
                continue;
            }
            // Cell spanning several columns
            if (tcPr.isSetGridSpan()) {
                System.out.println("跨列单元格，跨度: " + tcPr.getGridSpan().getVal());
            }
            // Cell taking part in a vertical (row) merge
            if (tcPr.isSetVMerge()) {
                System.out.println("跨行单元格: " + tcPr.getVMerge().getVal());
            }
        }
    }

    // Table style, if the table declares one
    CTTblPr tblPr = table.getCTTbl().getTblPr();
    if (tblPr != null && tblPr.isSetTblStyle()) {
        System.out.println("表格样式: " + tblPr.getTblStyle().getVal());
    }
}

3. 批注和修订处理

/**
 * Prints the document's comments (author, date, text) and, for every body
 * paragraph whose paragraph-mark run properties carry a tracked change, the
 * author and date of that change.
 *
 * @param doc the document to inspect
 */
public void processComments(XWPFDocument doc) {
    // Comments
    System.out.println("==== 批注 ====");
    // getComments() yields an array rather than a collection, so it has no
    // forEach of its own; the null guard covers a document without comments.
    if (doc.getComments() != null) {
        java.util.Arrays.stream(doc.getComments()).forEach(comment -> {
            System.out.println("作者: " + comment.getAuthor());
            System.out.println("日期: " + comment.getDate());
            System.out.println("内容: " + comment.getText());
        });
    }

    // Tracked changes on paragraph-mark run properties
    System.out.println("==== 修订 ====");
    // getPList() is the List view of the body paragraphs; getPArray() is an
    // array and cannot be used with forEach.
    doc.getDocument().getBody().getPList().forEach(p -> {
        if (p.getPPr() == null || p.getPPr().getRPr() == null) {
            return;
        }
        // The change record is read through the getter chain instead of a typed
        // local, so this code does not depend on the exact schema class that
        // getRPrChange() returns for paragraph-mark properties.
        if (p.getPPr().getRPr().getRPrChange() != null) {
            System.out.println("修改作者: " + p.getPPr().getRPr().getRPrChange().getAuthor());
            System.out.println("修改日期: " + p.getPPr().getRPr().getRPrChange().getDate());
        }
    });
}

四、性能优化方案

1. 大文件处理策略

/**
 * Opens a .docx file from its path, prints its extracted text and hands the
 * paragraphs to {@code processParagraphBatch} in slices of 100.
 *
 * @param filePath path of the .docx file
 * @throws IOException if the file cannot be read or is not a valid package
 */
public void processLargeDocx(String filePath) throws IOException {
    // Open read-only: this method never modifies the document, and a package
    // opened with the default read/write access is saved back on close.
    try (OPCPackage pkg = OPCPackage.open(
             filePath, org.apache.poi.openxml4j.opc.PackageAccess.READ);
         XWPFDocument doc = new XWPFDocument(pkg)) {

        // The extractor works on the document object created above (this is
        // not a streaming/SAX parse), so the whole text is built in memory.
        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
        System.out.println("核心文本: " + extractor.getText());

        // Hand the paragraphs over in fixed-size slices
        int batchSize = 100;
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        for (int i = 0; i < paragraphs.size(); i += batchSize) {
            int end = Math.min(i + batchSize, paragraphs.size());
            List<XWPFParagraph> batch = paragraphs.subList(i, end);
            processParagraphBatch(batch);
        }
    } catch (org.apache.poi.openxml4j.exceptions.InvalidFormatException e) {
        // Opening a package reports a malformed file with a checked exception
        // that is not an IOException; wrap it to keep the declared signature.
        throw new IOException("无效的docx文件: " + filePath, e);
    }
}

/**
 * Visits each paragraph of one batch. The per-paragraph work is a placeholder.
 *
 * @param batch the paragraphs of the current batch
 */
private void processParagraphBatch(List<XWPFParagraph> batch) {
    for (XWPFParagraph paragraph : batch) {
        // per-paragraph handling goes here
    }
}

2. 内存管理技巧

/**
 * Copies the file to a temporary file, opens the copy as a document, and
 * deletes the temporary file afterwards whether or not processing succeeded.
 *
 * @param filePath path of the source .docx file
 * @throws IOException if creating, copying, reading or deleting the file fails
 */
public void processWithTempFile(String filePath) throws IOException {
    final Path scratch = Files.createTempFile("word_", ".tmp");
    try {
        final Path source = Paths.get(filePath);
        Files.copy(source, scratch, StandardCopyOption.REPLACE_EXISTING);

        try (InputStream in = Files.newInputStream(scratch);
             XWPFDocument doc = new XWPFDocument(in)) {
            // document handling goes here
        }
    } finally {
        // Always remove the working copy
        Files.deleteIfExists(scratch);
    }
}

/**
 * Loads a document with docx4j and visits only its first ten top-level paragraphs.
 *
 * <p>The limit is counted in paragraphs, not pages: the content list holds
 * block-level objects and carries no page information. The whole document is
 * still loaded; only the iteration is bounded.
 *
 * @param filePath path of the .docx file
 * @throws IOException if the document cannot be loaded
 */
public void partialProcessing(String filePath) throws IOException {
    final int maxParagraphs = 10;

    // The package is a plain object here (not used as a try-with-resources
    // resource), and load failures are wrapped to keep the declared signature.
    WordprocessingMLPackage wordMLPackage;
    try {
        wordMLPackage = WordprocessingMLPackage.load(new File(filePath));
    } catch (org.docx4j.openpackaging.exceptions.Docx4JException e) {
        throw new IOException("无法加载文档: " + filePath, e);
    }

    List<Object> content = wordMLPackage.getMainDocumentPart().getContent();
    int handled = 0;
    for (Object obj : content) {
        if (!(obj instanceof P)) {
            continue;
        }
        // Check the limit before handling, so exactly maxParagraphs are handled
        if (handled >= maxParagraphs) {
            break;
        }
        // paragraph handling goes here
        handled++;
    }
}

五、异常处理与调试

常见异常处理

/**
 * Reads a Word file without letting the expected failures escape.
 *
 * <p>Tries {@code readDocx} first. If the file is reported as not being an
 * Office XML file, falls back to {@code readDoc}. Encrypted documents, I/O
 * errors and any other exception from the first attempt are reported on
 * {@code System.err}.
 *
 * @param filePath path of the file to read
 */
public void safeReadDocx(String filePath) {
    try {
        // First attempt: treat the file as .docx
        readDocx(filePath);
    } catch (NotOfficeXmlFileException notOoxml) {
        readAsLegacyDoc(filePath);
    } catch (EncryptedDocumentException encrypted) {
        System.err.println("加密文档无法读取: " + encrypted.getMessage());
    } catch (IOException ioFailure) {
        System.err.println("文件读取错误: " + ioFailure.getMessage());
    } catch (Exception unexpected) {
        System.err.println("未知错误: " + unexpected.getClass().getName());
        unexpected.printStackTrace();
    }
}

/** Second attempt: treat the file as .doc and report an I/O failure on System.err. */
private void readAsLegacyDoc(String filePath) {
    try {
        readDoc(filePath);
    } catch (IOException ex) {
        System.err.println("不支持的文档格式: " + ex.getMessage());
    }
}

调试技巧

/**
 * Document structure analysis helper: prints the number of paragraphs, tables,
 * headers and footers, then writes the main document XML to standard output
 * for debugging.
 *
 * @param filePath path of the .docx file
 * @throws IOException if the file cannot be opened or read, or the XML cannot be written
 */
public void analyzeDocStructure(String filePath) throws IOException {
    // The stream is its own resource so this method closes it itself.
    try (InputStream in = new FileInputStream(filePath);
         XWPFDocument doc = new XWPFDocument(in)) {
        System.out.println("段落数量: " + doc.getParagraphs().size());
        System.out.println("表格数量: " + doc.getTables().size());
        System.out.println("页眉数量: " + doc.getHeaderList().size());
        System.out.println("页脚数量: " + doc.getFooterList().size());

        // Raw XML of the main document part (debugging only). The underlying
        // XML object is serialised with save(OutputStream).
        doc.getDocument().save(System.out);
    }
}

/**
 * Style debugging helper: prints the paragraph's style and, for each of its
 * runs, the run itself (via toString()), its font family, font size and colour.
 *
 * @param p the paragraph to inspect
 */
public void debugStyles(XWPFParagraph p) {
    String paragraphStyle = p.getStyle();
    System.out.println("段落样式: " + paragraphStyle);

    p.getRuns().forEach(textRun -> {
        System.out.println("Run文本: " + textRun);
        System.out.println("  Font: " + textRun.getFontFamily());
        System.out.println("  Size: " + textRun.getFontSize());
        System.out.println("  Color: " + textRun.getColor());
    });
}

六、扩展应用场景

1. 文档内容搜索

/**
 * Collects every body paragraph, table cell, header paragraph and footer
 * paragraph whose text contains {@code keyword}.
 *
 * <p>Each hit is returned as a label ("段落", "表格单元格", "页眉" or "页脚")
 * followed by the full text of the matching element.
 *
 * @param filePath path of the .docx file
 * @param keyword  text to look for (case-sensitive substring match)
 * @return the hits, in the order body paragraphs, tables, headers, footers
 * @throws IOException if the file cannot be opened or read
 */
public List<String> searchInDocument(String filePath, String keyword) throws IOException {
    List<String> results = new ArrayList<>();

    // The stream is its own resource so this method closes it itself.
    try (InputStream in = new FileInputStream(filePath);
         XWPFDocument doc = new XWPFDocument(in)) {
        // Body paragraphs
        doc.getParagraphs().stream()
            .filter(p -> p.getText().contains(keyword))
            .forEach(p -> results.add("段落: " + p.getText()));

        // Table cells
        doc.getTables().forEach(table ->
            table.getRows().forEach(row ->
                row.getTableCells().forEach(cell -> {
                    if (cell.getText().contains(keyword)) {
                        results.add("表格单元格: " + cell.getText());
                    }
                })));

        // Headers
        doc.getHeaderList().forEach(header ->
            header.getParagraphs().stream()
                .filter(p -> p.getText().contains(keyword))
                .forEach(p -> results.add("页眉: " + p.getText())));

        // Footers, so that the search really covers headers and footers
        doc.getFooterList().forEach(footer ->
            footer.getParagraphs().stream()
                .filter(p -> p.getText().contains(keyword))
                .forEach(p -> results.add("页脚: " + p.getText())));
    }

    return results;
}

2. 文档比较工具

/**
 * Prints a simple comparison of two .docx files: the result of
 * {@code StringUtils.difference} on their newline-joined paragraph texts,
 * and the differences in paragraph count and table count (first minus second).
 *
 * @param file1 path of the first document
 * @param file2 path of the second document
 * @throws IOException if either file cannot be opened or read
 */
public void compareDocuments(String file1, String file2) throws IOException {
    // Each stream is its own resource so both are closed by this method,
    // including the first one when opening the second document fails.
    try (InputStream in1 = new FileInputStream(file1);
         XWPFDocument doc1 = new XWPFDocument(in1);
         InputStream in2 = new FileInputStream(file2);
         XWPFDocument doc2 = new XWPFDocument(in2)) {

        // Plain text comparison
        String text1 = joinParagraphText(doc1);
        String text2 = joinParagraphText(doc2);
        System.out.println("文本差异: " + StringUtils.difference(text1, text2));

        // Structural comparison
        System.out.println("段落数量差异: " +
            (doc1.getParagraphs().size() - doc2.getParagraphs().size()));
        System.out.println("表格数量差异: " +
            (doc1.getTables().size() - doc2.getTables().size()));
    }
}

/** Joins the text of all body paragraphs with newline separators. */
private String joinParagraphText(XWPFDocument doc) {
    return doc.getParagraphs().stream()
        .map(XWPFParagraph::getText)
        .collect(Collectors.joining("\n"));
}

3. 文档转换器（转HTML）

/**
 * Converts the document to XHTML with default converter options and returns
 * the result decoded as UTF-8.
 *
 * @param doc the document to convert
 * @return the generated markup
 * @throws RuntimeException wrapping the IOException if the conversion fails
 */
public String convertToHtml(XWPFDocument doc) {
    final XHTMLOptions defaults = XHTMLOptions.create();
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();

    try {
        XHTMLConverter.getInstance().convert(doc, buffer, defaults);
        String html = buffer.toString("UTF-8");
        return html;
    } catch (IOException failure) {
        throw new RuntimeException("转换失败", failure);
    }
}

七、企业级应用建议

1. 最佳实践清单

格式检测：通过文件魔数判断真实格式 public static boolean isDocx(File file) throws IOException { try (InputStream is = new FileInputStream(file)) { byte[] header = new byte[4]; is.read(header); return Arrays.equals(header, new byte[]{0x50, 0x4B, 0x03, 0x04}); } }
资源清理：确保关闭所有资源 public void safeProcess(File file) { try (InputStream is = new FileInputStream(file); OPCPackage pkg = OPCPackage.open(is); XWPFDocument doc = new XWPFDocument(pkg)) { // 处理逻辑 } catch (Exception e) { // 处理异常 } }
性能监控：添加处理指标 public void processWithMetrics(String filePath) { long start = System.currentTimeMillis(); try { // 处理逻辑 long duration = System.currentTimeMillis() - start; metrics.record("word.process.time", duration); } catch (Exception e) { metrics.increment("word.process.errors"); } }

2. 安全注意事项

防注入攻击： public void sanitizeContent(XWPFDocument doc) { doc.getParagraphs().forEach(p -> { String text = p.getText(); if (text.contains("<script>") || text.contains("<?xml")) { throw new SecurityException("危险内容检测"); } }); }
文件大小限制： public void validateFile(File file) { if (file.length() > 10 * 1024 * 1024) { // 10MB限制 throw new IllegalArgumentException("文件过大"); } }
病毒扫描集成： public void scanForViruses(File file) throws VirusFoundException { // 调用企业杀毒软件API if (virusScanner.scan(file)) { throw new VirusFoundException(); } }

八、替代方案补充

1. docx4j 核心示例

/**
 * docx4j reading example: loads the document, prints the string form of the
 * main part's content list, visits every {@code w:tbl} node, and prints the
 * HTML produced by the non-XSLT exporter.
 *
 * @param filePath path of the .docx file
 * @throws Exception if loading, the XPath query or the export fails
 */
public void readWithDocx4j(String filePath) throws Exception {
    File source = new File(filePath);
    WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(source);
    MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart();

    // Prints toString() of the content list
    List<Object> content = documentPart.getContent();
    System.out.println(content.toString());

    // Visit every table node matched by the XPath expression
    List<Object> tables = documentPart.getJAXBNodesViaXPath("//w:tbl", false);
    for (Object node : tables) {
        Tbl tbl = (Tbl) node;
        // table handling goes here
    }

    // Export to HTML and print it
    HTMLSettings htmlSettings = new HTMLSettings();
    String html = HtmlExporterNonXSLT.process(wordMLPackage, htmlSettings);
    System.out.println(html);
}

2. Aspose.Words 示例

/**
 * Aspose.Words (commercial library) example: prints the document text, prints
 * the text of the paragraph the builder is positioned on after moving to the
 * primary header, and saves the document as "output.pdf".
 *
 * @param filePath path of the Word file
 * @throws Exception if loading or saving fails
 */
public void readWithAspose(String filePath) throws Exception {
    com.aspose.words.Document doc = new com.aspose.words.Document(filePath);

    // Whole-document text
    String fullText = doc.getText();
    System.out.println(fullText);

    // Position a builder in the primary header and read the current paragraph
    DocumentBuilder builder = new DocumentBuilder(doc);
    builder.moveToHeaderFooter(HeaderFooterType.HEADER_PRIMARY);
    String headerText = builder.getCurrentParagraph().getText();
    System.out.println("页眉: " + headerText);

    // Export as PDF
    doc.save("output.pdf", SaveFormat.PDF);
}

九、综合实战案例

合同文档分析系统

/**
 * Extracts structured information from a contract stored as a .docx file.
 */
public class ContractAnalyzer {
    /**
     * Clause headings such as "第三条". Compiled once instead of on every call.
     * Only numerals written with the characters 一 to 十 are matched.
     */
    private static final Pattern CLAUSE_PATTERN =
        Pattern.compile("第[一二三四五六七八九十]+条");

    /**
     * Runs all analysis steps on the contract and returns the collected result.
     *
     * @param filePath path of the contract .docx file
     * @return the analysis result filled in by the individual steps
     * @throws IOException if the file cannot be opened or read
     */
    public ContractAnalysisResult analyzeContract(String filePath) throws IOException {
        ContractAnalysisResult result = new ContractAnalysisResult();

        // The stream is its own resource so this method closes it itself.
        try (InputStream in = new FileInputStream(filePath);
             XWPFDocument doc = new XWPFDocument(in)) {
            // 1. Extract key clauses
            extractKeyClauses(doc, result);

            // 2. Analyse the signing parties
            analyzeParties(doc, result);

            // 3. Check important dates
            checkImportantDates(doc, result);

            // 4. Validate signature blocks
            validateSignatureBlocks(doc, result);
        }

        return result;
    }

    /**
     * Adds one clause per paragraph that contains a clause heading; the clause
     * is built from the first heading found and the paragraph's full text.
     */
    private void extractKeyClauses(XWPFDocument doc, ContractAnalysisResult result) {
        for (XWPFParagraph paragraph : doc.getParagraphs()) {
            String text = paragraph.getText();
            Matcher m = CLAUSE_PATTERN.matcher(text);
            if (m.find()) {
                result.addClause(new Clause(m.group(), text));
            }
        }
    }

    // Other analysis methods...
}

/**
 * Analysis result object filled in by the contract analysis steps.
 */
class ContractAnalysisResult {
    // Collections start empty so the accessors below never return null.
    private List<Clause> clauses = new ArrayList<>();
    private List<Party> parties = new ArrayList<>();
    private Map<String, Date> importantDates = new HashMap<>();
    private boolean hasSignature;

    /** Appends one extracted clause. */
    public void addClause(Clause clause) {
        clauses.add(clause);
    }

    /** Returns a read-only view of the clauses collected so far. */
    public List<Clause> getClauses() {
        return java.util.Collections.unmodifiableList(clauses);
    }

    /** Returns a read-only view of the signing parties collected so far. */
    public List<Party> getParties() {
        return java.util.Collections.unmodifiableList(parties);
    }

    // remaining getters/setters
}

/**
 * Usage example: analyses "contract.docx" and prints the number of clauses
 * found and the signing parties.
 *
 * @param args unused
 * @throws IOException if the contract file cannot be opened or read;
 *         declared because analyzeContract throws this checked exception
 */
public static void main(String[] args) throws IOException {
    ContractAnalyzer analyzer = new ContractAnalyzer();
    ContractAnalysisResult result = analyzer.analyzeContract("contract.docx");

    System.out.println("发现条款数量: " + result.getClauses().size());
    System.out.println("合同签署方: " + result.getParties());
}

通过本指南，您应该已经掌握了Java处理Word文档的核心技术。关键点总结：