用 C# 调用 iText 读取 PDF 文件并提取文本,相邻文本的间距超过一个字符宽度时用空格补齐。
时间: 2024-05-12 09:19:04 浏览: 178
以下是使用 C# 调用 iText 读取 PDF 文件并提取文本的示例代码:
```csharp
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using System.Text;
/// <summary>
/// Extracts the text of every page of a PDF file using iText's
/// <c>SimpleTextExtractionStrategy</c>.
/// </summary>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <returns>
/// The text of all pages in page order, with a single <c>'\n'</c> between
/// consecutive pages.
/// </returns>
public static string ExtractTextFromPDF(string filepath)
{
    StringBuilder sb = new StringBuilder();
    using (PdfReader reader = new PdfReader(filepath))
    using (PdfDocument pdfDoc = new PdfDocument(reader))
    {
        // The page count does not change while reading, so query it once.
        int pageCount = pdfDoc.GetNumberOfPages();

        // iText page numbers are 1-based.
        for (int i = 1; i <= pageCount; i++)
        {
            // Separate pages explicitly; otherwise the last line of one page
            // is glued to the first line of the next one.
            if (i > 1)
            {
                sb.Append('\n');
            }

            PdfPage page = pdfDoc.GetPage(i);
            // A fresh strategy per page so that text does not accumulate
            // across pages inside the strategy itself.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            sb.Append(PdfTextExtractor.GetTextFromPage(page, strategy));
        }
    }
    return sb.ToString();
}
```
为了实现“相邻文本的间距超过一个字符宽度时用空格补齐”的功能,我们可以自定义一个文本提取策略(CustomTextExtractionStrategy),在提取文本时根据相邻文本之间的间距决定是否插入空格:
```csharp
/// <summary>
/// Extracts the text of every page of a PDF file using
/// <c>CustomTextExtractionStrategy</c>, which is responsible for the
/// spacing between adjacent pieces of text.
/// </summary>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <returns>
/// The text of all pages in page order, with a single <c>'\n'</c> between
/// consecutive pages.
/// </returns>
public static string ExtractTextFromPDF(string filepath)
{
    StringBuilder sb = new StringBuilder();
    using (PdfReader reader = new PdfReader(filepath))
    using (PdfDocument pdfDoc = new PdfDocument(reader))
    {
        // The page count does not change while reading, so query it once.
        int pageCount = pdfDoc.GetNumberOfPages();

        // iText page numbers are 1-based.
        for (int i = 1; i <= pageCount; i++)
        {
            // Separate pages explicitly; otherwise the last line of one page
            // is glued to the first line of the next one.
            if (i > 1)
            {
                sb.Append('\n');
            }

            PdfPage page = pdfDoc.GetPage(i);
            // A fresh strategy per page so that text does not accumulate
            // across pages inside the strategy itself.
            CustomTextExtractionStrategy strategy = new CustomTextExtractionStrategy();
            string text = PdfTextExtractor.GetTextFromPage(page, strategy);

            // Give the strategy the chance to post-process the spacing of the page text.
            text = strategy.FillInSpaceBetweenWords(text);
            sb.Append(text);
        }
    }
    return sb.ToString();
}
/// <summary>
/// Location-based text extraction strategy that inserts a space between two
/// adjacent text chunks on the same line only when the gap between them is
/// wider than one full space character.
/// </summary>
/// <remarks>
/// The decision is made in <see cref="IsChunkAtWordBoundary"/>, the hook that
/// <c>LocationTextExtractionStrategy</c> consults while assembling the
/// resultant text. The stock implementation uses half a space width as the
/// threshold; this class raises it to a full space width.
/// Spacing has to be decided from chunk positions on the page. It cannot be
/// reconstructed afterwards from the characters of the extracted string,
/// because glyph metrics alone say nothing about where text was drawn.
/// </remarks>
public class CustomTextExtractionStrategy : LocationTextExtractionStrategy
{
    // Space width of the chunk examined most recently; 0 until the first
    // pair of chunks has been compared.
    private float _lastSingleSpaceWidth;

    /// <summary>
    /// Decides whether a space must be emitted between
    /// <paramref name="previousChunk"/> and <paramref name="chunk"/>.
    /// </summary>
    /// <param name="chunk">The chunk about to be appended.</param>
    /// <param name="previousChunk">The chunk appended just before it.</param>
    /// <returns>
    /// <c>true</c> when the gap between the two chunks is wider than one
    /// space character of <paramref name="chunk"/>; otherwise <c>false</c>.
    /// </returns>
    protected override bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
    {
        ITextChunkLocation current = chunk.GetLocation();
        ITextChunkLocation previous = previousChunk.GetLocation();
        _lastSingleSpaceWidth = current.GetCharSpaceWidth();

        // Zero-length chunks are typically combining marks; never pad around them.
        if (current.GetStartLocation().Equals(current.GetEndLocation())
            || previous.GetStartLocation().Equals(previous.GetEndLocation()))
        {
            return false;
        }

        // Measure from the END of the previous chunk to the start of this one.
        // Measuring start-to-start would include the previous chunk's own width.
        float gap = current.DistanceFromEndOf(previous);
        if (gap < 0)
        {
            // Chunks may arrive in reverse visual order; measure the other way.
            gap = previous.DistanceFromEndOf(current);
            if (gap < 0)
            {
                // The chunks overlap, so there is no gap to fill.
                return false;
            }
        }

        return gap > _lastSingleSpaceWidth;
    }

    /// <summary>
    /// Kept for backward compatibility with existing callers.
    /// </summary>
    /// <remarks>
    /// Spaces are already inserted while the page text is assembled, so no
    /// further processing is applied here.
    /// </remarks>
    /// <param name="text">Text returned by the extraction.</param>
    /// <returns>
    /// <paramref name="text"/> unchanged, or an empty string when it is <c>null</c>.
    /// </returns>
    public string FillInSpaceBetweenWords(string text)
    {
        return text ?? string.Empty;
    }

    /// <summary>
    /// Gets the space width used in the most recent word-boundary decision.
    /// </summary>
    /// <returns>
    /// The space width of the last chunk examined, or 0 when no chunks have
    /// been compared yet.
    /// </returns>
    public float GetSingleSpaceWidth()
    {
        return _lastSingleSpaceWidth;
    }
}
```
阅读全文
相关推荐


















