Java代码实现pdf文件中的表格以及内容转成excel文件输出
时间: 2024-03-02 10:49:22 浏览: 450
可以使用Apache PDFBox和Apache POI这两个Java库来实现将PDF文件中的表格以及内容转成Excel文件输出的功能。具体实现步骤如下:
1. 引入依赖
在项目的pom.xml文件中添加以下依赖:
```
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.23</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.0.0</version>
</dependency>
```
2. 加载PDF文件
使用PDFBox库中的PDFTextStripper类将PDF文件中的文本内容提取出来,然后再使用PDFTableStripper类将PDF文件中的表格内容提取出来。注意:PDFTableStripper并不包含在PDFBox官方发行包中,需要自行实现或另行引入(通常通过继承PDFTextStripper来实现)。具体代码如下:
```
// Open the PDF from the working directory and extract its plain text.
PDDocument document = PDDocument.load(new File("test.pdf"));
PDFTextStripper textStripper = new PDFTextStripper();
String text = textStripper.getText(document);
// NOTE(review): PDFTableStripper is not defined in this snippet and does not appear to be
// part of the PDFBox 2.0.x distribution; presumably a custom or third-party class — confirm.
PDFTableStripper tableStripper = new PDFTableStripper();
tableStripper.setSortByPosition(true);
// NOTE(review): if this class follows PDFTextStripper's convention, page numbers are 1-based,
// so a start page of 0 may be unintended — confirm.
tableStripper.setStartPage(0);
tableStripper.setEndPage(document.getNumberOfPages());
// Only the page at index 0 is passed to getRows, so later pages are not inspected here.
List<List<Rectangle>> rows = tableStripper.getRows(document.getPage(0));
// NOTE(review): document is never closed in this snippet.
```
3. 将表格内容转成二维数组
根据表格的行列信息,将表格内容转成二维数组。具体代码如下:
```
// Build one list of trimmed cell strings per row of cell rectangles.
List<List<String>> table = new ArrayList<>();
for (List<Rectangle> row : rows) {
List<String> rowData = new ArrayList<>();
for (Rectangle cell : row) {
// Cell bounds are truncated to whole numbers before being used as lookup keys.
int x = (int) cell.getX();
int y = (int) cell.getY();
int width = (int) cell.getWidth();
int height = (int) cell.getHeight();
// Searches the extracted text for the literal markers "(x,y)" and "(x+width,y+height)" and
// takes everything from the start of the first marker up to the start of the second.
// NOTE(review): String.indexOf returns -1 when a marker is absent, and substring then throws
// StringIndexOutOfBoundsException; it is unclear whether the extracted text ever contains
// such coordinate markers — verify before relying on this.
String cellText = text.substring(text.indexOf("(" + x + "," + y + ")"), text.indexOf("(" + (x + width) + "," + (y + height) + ")"));
rowData.add(cellText.trim());
}
table.add(rowData);
}
```
4. 将表格内容写入Excel文件
使用POI库中的Workbook和Sheet类创建Excel文件和工作表,然后将表格内容写入工作表中。具体代码如下:
```
Workbook workbook = new XSSFWorkbook();
Sheet sheet = workbook.createSheet();
for (int i = 0; i < table.size(); i++) {
Row row = sheet.createRow(i);
List<String> rowData = table.get(i);
for (int j = 0; j < rowData.size(); j++) {
Cell cell = row.createCell(j);
cell.setCellValue(rowData.get(j));
}
}
FileOutputStream outputStream = new FileOutputStream("test.xlsx");
workbook.write(outputStream);
workbook.close();
outputStream.close();
```
完整代码如下:
```
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.util.Rectangle;
import org.apache.pdfbox.util.Splitter;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.interactive.form.*;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;
public class PdfToExcel {
public static void main(String[] args) throws Exception {
PDDocument document = PDDocument.load(new File("test.pdf"));
PDFTextStripper textStripper = new PDFTextStripper();
String text = textStripper.getText(document);
PDFTableStripper tableStripper = new PDFTableStripper();
tableStripper.setSortByPosition(true);
tableStripper.setStartPage(0);
tableStripper.setEndPage(document.getNumberOfPages());
List<List<Rectangle>> rows = tableStripper.getRows(document.getPage(0));
List<List<String>> table = new ArrayList<>();
for (List<Rectangle> row : rows) {
List<String> rowData = new ArrayList<>();
for (Rectangle cell : row) {
int x = (int) cell.getX();
int y = (int) cell.getY();
int width = (int) cell.getWidth();
int height = (int) cell.getHeight();
String cellText = text.substring(text.indexOf("(" + x + "," + y + ")"), text.indexOf("(" + (x + width) + "," + (y + height) + ")"));
rowData.add(cellText.trim());
}
table.add(rowData);
}
Workbook workbook = new XSSFWorkbook();
Sheet sheet = workbook.createSheet();
for (int i = 0; i < table.size(); i++) {
Row row = sheet.createRow(i);
List<String> rowData = table.get(i);
for (int j = 0; j < rowData.size(); j++) {
Cell cell = row.createCell(j);
cell.setCellValue(rowData.get(j));
}
}
FileOutputStream outputStream = new FileOutputStream("test.xlsx");
workbook.write(outputStream);
workbook.close();
outputStream.close();
}
}
```
阅读全文