用Python 3写一段代码,提取PDF和OFD格式发票的内容,并以JSON格式输出
时间: 2023-11-28 17:49:11 浏览: 110
为了提取PDF和OFD格式的发票内容,我们可以使用第三方库PyMuPDF(在代码中以`fitz`为模块名导入)和ofdreader。使用之前需要先安装这两个库,可以通过以下命令进行安装:
```
pip install PyMuPDF ofdreader
```
以下是提取PDF和OFD格式发票内容并JSON格式化输出的代码:
```python
import json

import fitz
from ofdreader import OFD
# Invoice labels looked for on each text line, in priority order, mapped to
# the key used in the returned dictionary.
_PDF_FIELD_LABELS = (
    ('发票代码', 'invoice_code'),
    ('发票号码', 'invoice_number'),
    ('开票日期', 'invoice_date'),
    ('购买方名称', 'buyer_name'),
    ('税前金额', 'pre_tax_amount'),
    ('税额', 'tax_amount'),
    ('价税合计', 'total_amount'),
)


def _text_after_first_colon(line):
    """Return the stripped text after the first colon in *line*.

    Both the ASCII colon and the full-width colon are accepted as the
    separator. Returns None when the line contains neither.
    """
    positions = [pos for pos in (line.find(':'), line.find(':')) if pos != -1]
    if not positions:
        return None
    return line[min(positions) + 1:].strip()


def extract_pdf_invoice(pdf_file):
    """Extract invoice fields from the first page of a PDF file.

    The text of page 0 is scanned line by line. A line containing one of the
    known labels contributes the text after its first colon as that field's
    value. Lines with a label but no colon are skipped instead of raising.

    Args:
        pdf_file: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A dict holding only the fields that were found, keyed by
        ``invoice_code``, ``invoice_number``, ``invoice_date``,
        ``buyer_name``, ``pre_tax_amount``, ``tax_amount`` and
        ``total_amount``. Empty when the document has no pages.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_file) as doc:
        if len(doc) == 0:
            return {}
        # Indexing and get_text() replace the legacy loadPage()/getText()
        # names, which are not available in current PyMuPDF releases.
        text = doc[0].get_text()

    invoice_data = {}
    for line in text.split('\n'):
        for label, key in _PDF_FIELD_LABELS:
            if label not in line:
                continue
            value = _text_after_first_colon(line)
            if value is not None:
                invoice_data[key] = value
            # Only the first matching label applies to a given line.
            break
    return invoice_data
# Invoice labels searched for in the page text, mapped to the key used in
# the returned dictionary.
_OFD_FIELD_LABELS = (
    ('发票代码', 'invoice_code'),
    ('发票号码', 'invoice_number'),
    ('开票日期', 'invoice_date'),
    ('购买方名称', 'buyer_name'),
    ('税前金额', 'pre_tax_amount'),
    ('税额', 'tax_amount'),
    ('价税合计', 'total_amount'),
)


def _ofd_value_after_label(text, label):
    """Return the value that follows the first occurrence of *label*.

    The value runs up to the nearest following known label (or the end of
    the text). Surrounding whitespace and a leading ASCII or full-width
    colon separator are removed. The caller must ensure *label* is in *text*.
    """
    rest = text[text.find(label) + len(label):]
    end = len(rest)
    for other_label, _ in _OFD_FIELD_LABELS:
        pos = rest.find(other_label)
        if pos != -1:
            end = min(end, pos)
    return rest[:end].strip().lstrip('::').strip()


def extract_ofd_invoice(ofd_file):
    """Extract invoice fields from the text of an OFD file.

    Each page's text is joined into one line and every known label found in
    it contributes the text that follows it, up to the next known label.
    Cutting at the nearest label keeps a field from swallowing the fields
    after it, and the colon separator is stripped from the value.

    Args:
        ofd_file: Path of the OFD file, passed to ``OFD``.

    Returns:
        A dict holding only the fields that were found, keyed by
        ``invoice_code``, ``invoice_number``, ``invoice_date``,
        ``buyer_name``, ``pre_tax_amount``, ``tax_amount`` and
        ``total_amount``. When a label occurs on several pages, the value
        from the last such page is kept.
    """
    # NOTE(review): OFD(...), .pages and .extract_text() are used exactly as
    # in the original; confirm them against the installed ofdreader version.
    ofd = OFD(ofd_file)
    invoice_data = {}
    for page in ofd.pages:
        text = page.extract_text().replace('\n', '')
        for label, key in _OFD_FIELD_LABELS:
            if label in text:
                invoice_data[key] = _ofd_value_after_label(text, label)
    return invoice_data
# Demo: extract the fields from one sample PDF and one sample OFD invoice
# and print each result as pretty-printed JSON.
pdf_invoice = extract_pdf_invoice('invoice.pdf')
ofd_invoice = extract_ofd_invoice('invoice.ofd')
# ensure_ascii=False prints the Chinese field values as readable text
# instead of \uXXXX escape sequences.
print(json.dumps(pdf_invoice, ensure_ascii=False, indent=4))
print(json.dumps(ofd_invoice, ensure_ascii=False, indent=4))
```
在上面的代码中,我们定义了两个函数`extract_pdf_invoice()`和`extract_ofd_invoice()`,分别用于提取PDF和OFD格式的发票内容。这些函数将返回一个包含发票信息的字典对象。最后,我们使用`json.dumps()`函数将这些字典对象转换为JSON格式,并使用`print()`函数打印输出。