文章评分
次,平均分 :
- 基于 fitz 库和正则搜索提取图片
- 基于 pdf2image 库的两种方法提取图片
基于 fitz 库和正则搜索
import os
import re

import fitz  # PyMuPDF

file_path = r'C:\xxx\xxx.pdf'  # path of the PDF file
dir_path = r'C:\xxx'  # folder that receives the extracted images


def pdf2image1(path, pic_path):
    """Extract the images embedded in a PDF and save each one as a PNG.

    Every cross-reference (xref) object of the document is scanned; an
    object whose definition contains ``/Subtype /Image`` is loaded as a
    pixmap and written to ``pic_path`` as ``img_<n>.png``, with ``n``
    counting up from 1.

    Args:
        path: Path of the PDF file to read.
        pic_path: Folder that receives the PNG files; created if missing.
    """
    # Matches "/Subtype" only when it is followed by "/Image".
    image_marker = re.compile(r"/Subtype(?= */Image)")
    os.makedirs(pic_path, exist_ok=True)

    pdf = fitz.open(path)
    try:
        # NOTE(review): _getXrefLength / _getXrefString / writePNG are
        # legacy PyMuPDF names; newer releases appear to expose them as
        # xref_length() / xref_object() / Pixmap.save() -- confirm against
        # the installed PyMuPDF version before upgrading.
        count = 1
        for xref in range(1, pdf._getXrefLength()):
            if not image_marker.search(pdf._getXrefString(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            pix.writePNG(os.path.join(pic_path, f"img_{count}.png"))
            count += 1
            pix = None  # drop the pixmap reference before the next object
    finally:
        # Close the document even when an image fails to load or save.
        pdf.close()


pdf2image1(file_path, dir_path)

import os
import re

import fitz  # PyMuPDF

file_path = r'C:\xxx\xxx.pdf'  # path of the PDF file
dir_path = r'C:\xxx'  # folder that receives the extracted images


def pdf2image1(path, pic_path, min_size=10000):
    """Extract the embedded images of a PDF, skipping very small ones.

    Every cross-reference (xref) object of the document is scanned; an
    object whose definition contains ``/Subtype /Image`` is loaded as a
    pixmap. Pixmaps whose ``size`` attribute is below ``min_size`` are
    skipped; the rest are written to ``pic_path`` as ``img_<n>.png``,
    with ``n`` counting up from 1 over the images actually saved.

    Args:
        path: Path of the PDF file to read.
        pic_path: Folder that receives the PNG files; created if missing.
        min_size: Threshold compared against ``pix.size``; pixmaps below
            it are not saved. Defaults to 10000.
    """
    # Matches "/Subtype" only when it is followed by "/Image".
    image_marker = re.compile(r"/Subtype(?= */Image)")
    os.makedirs(pic_path, exist_ok=True)

    pdf = fitz.open(path)
    try:
        # NOTE(review): _getXrefLength / _getXrefString / writePNG are
        # legacy PyMuPDF names; newer releases appear to expose them as
        # xref_length() / xref_object() / Pixmap.save() -- confirm against
        # the installed PyMuPDF version before upgrading.
        count = 1
        for xref in range(1, pdf._getXrefLength()):
            if not image_marker.search(pdf._getXrefString(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            # Size check: anything below the threshold is skipped and the
            # loop moves on to the next xref object.
            if pix.size < min_size:
                continue
            pix.writePNG(os.path.join(pic_path, f"img_{count}.png"))
            count += 1
            pix = None  # drop the pixmap reference before the next object
    finally:
        # Close the document even when an image fails to load or save.
        pdf.close()


pdf2image1(file_path, dir_path)

- Windows 用户必须安装 poppler for Windows,然后将其 bin/ 文件夹添加到 PATH
- Mac 用户必须安装 poppler for Mac

import os
import tempfile

from pdf2image import convert_from_path
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError,
)

file_path = r'C:\xxx\xxx.pdf'  # path of the PDF file
dir_path = r'C:\xxx'  # folder that receives the rendered images


def pdf2image2(file_path, dir_path):
    """Render a PDF with pdf2image and save each resulting image as a PNG.

    The PDF is converted with ``convert_from_path`` at 200 dpi and every
    returned image is saved into ``dir_path`` as ``img_<index>.png``, with
    ``index`` starting at 0.

    Args:
        file_path: Path of the PDF file to convert.
        dir_path: Folder that receives the PNG files; created if missing.
    """
    images = convert_from_path(file_path, dpi=200)
    os.makedirs(dir_path, exist_ok=True)
    # enumerate() yields each image's position directly; looking it up with
    # images.index(image) rescans the list for every image and returns the
    # first match when two images compare equal.
    for index, image in enumerate(images):
        # Save into the output folder, not next to/under the PDF path.
        image.save(os.path.join(dir_path, f'img_{index}.png'), 'PNG')


pdf2image2(file_path, dir_path)
import os
import tempfile

from pdf2image import convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError,
)

file_path = r'C:\xxx\xxx.pdf'  # path of the PDF file
dir_path = r'C:\xxx'  # folder that receives the rendered images


def pdf2image3(file_path, dir_path):
    """Render a PDF from its raw bytes and save each image as a PNG.

    The file is read in binary mode, handed to ``convert_from_bytes`` and
    every returned image is saved into ``dir_path`` as ``img_<index>.png``,
    with ``index`` starting at 0.

    Args:
        file_path: Path of the PDF file to convert.
        dir_path: Folder that receives the PNG files; created if missing.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(file_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    images = convert_from_bytes(pdf_bytes)
    os.makedirs(dir_path, exist_ok=True)
    # enumerate() yields each image's position directly; looking it up with
    # images.index(image) rescans the list for every image and returns the
    # first match when two images compare equal.
    for index, image in enumerate(images):
        # Save into the output folder, not next to/under the PDF path.
        image.save(os.path.join(dir_path, f'img_{index}.png'), 'PNG')


pdf2image3(file_path, dir_path)

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
除特别注明外,本站所有文章均为安装打印机网原创,转载请注明出处来自https://www.azdyj.com/15956.html
暂无评论