pdf reader
This commit is contained in:
7
VideoCompress/.gitignore
vendored
Normal file
7
VideoCompress/.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
logs
|
||||||
|
test
|
||||||
|
config.json
|
||||||
|
*.xml
|
||||||
|
tmp
|
||||||
|
build
|
||||||
|
dist
|
36
pdf_index/README.md
Normal file
36
pdf_index/README.md
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# Streamlit PDF 目录查看器
|
||||||
|
|
||||||
|
一个最小可运行的 Streamlit 应用:
|
||||||
|
- 读取 `doc.pdf`(放在当前目录),或通过页面上传 PDF。
|
||||||
|
- 使用 PyMuPDF 提取 PDF 目录(Table of Contents)。
|
||||||
|
- 在下拉框选择目录项后,显示该目录项对应的页面范围(到下一个目录项前一页)。
|
||||||
|
- 使用 `streamlit_pdf_viewer` 提供的 `pdf_viewer` 组件内嵌查看选定页面范围的临时 PDF。
|
||||||
|
|
||||||
|
## 快速开始(Windows / cmd)
|
||||||
|
|
||||||
|
1) 建议创建虚拟环境(可选)
|
||||||
|
```cmd
|
||||||
|
python -m venv .venv
|
||||||
|
.venv\Scripts\activate
|
||||||
|
```
|
||||||
|
|
||||||
|
2) 安装依赖
|
||||||
|
```cmd
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
3) 将你的 PDF 放到本目录并命名为 `doc.pdf`(或在页面中上传)。
|
||||||
|
|
||||||
|
4) 运行应用
|
||||||
|
```cmd
|
||||||
|
streamlit run app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## 用法说明
|
||||||
|
- 左侧边栏可上传 PDF;若本地存在 `doc.pdf`,也会自动被加载。
|
||||||
|
- 目录下拉框显示形如 `title (001 - 012)`,即“标题 (起始页 - 结束页)”,页码补零到三位。
|
||||||
|
- 若 PDF 无目录,本应用会提示;可选择“全部页面”查看。
|
||||||
|
|
||||||
|
## 已知限制
|
||||||
|
- 目录页码通常为 PDF 内部页码(从 1 开始),个别 PDF 的 TOC 可能与实际页面偏移不一致。
|
||||||
|
- 页面范围切片依赖 TOC 顺序,若 TOC 不规范可能导致范围不准。
|
121
pdf_index/app.py
Normal file
121
pdf_index/app.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, List, Optional, Tuple
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import streamlit as st
|
||||||
|
from streamlit_pdf_viewer import pdf_viewer
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TocItem:
    """One table-of-contents entry together with the page span it covers.

    Page numbers are 1-based. ``page_to`` is inclusive; ``None`` means the
    span runs until the end of the document.
    """

    # Hierarchy level of the entry as delivered by the TOC.
    level: int
    # Text of the entry.
    title: str
    # First page of the span (1-based).
    page_from: int
    # Last page of the span (1-based, inclusive), or None for "until end".
    page_to: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
|
def read_toc(doc: fitz.Document) -> List[Tuple[int, str, int]]:
    """Return the document outline as ``(level, title, page)`` tuples.

    The outline is fetched through ``doc.get_toc(simple=True)`` when the
    object offers a callable ``get_toc``. Any failure while looking up or
    calling it is treated as "no outline" and yields an empty list.

    Page numbers are 1-based; values below 1 are raised to 1.
    """
    entries: List[Tuple[int, str, int]] = []
    try:
        fetch: Any = getattr(doc, "get_toc", None)
        entries = fetch(simple=True) if callable(fetch) else entries  # type: ignore[no-any-return]
    except Exception:
        # Best effort: a document whose outline cannot be read is handled
        # exactly like a document without one.
        entries = []

    cleaned: List[Tuple[int, str, int]] = []
    for level, title, page in entries:
        cleaned.append((level, title, max(1, page)))
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_ranges(toc: List[Tuple[int, str, int]], page_count: int) -> List[TocItem]:
    """Turn raw ``(level, title, page)`` entries into items with page spans.

    Each entry starts on its own page (clamped into ``1..page_count``) and
    ends on the page before the next entry starts; the final entry runs to
    ``page_count``. A span never ends before it starts. An empty ``toc``
    yields an empty list.
    """
    if not toc:
        return []

    ranges: List[TocItem] = []
    last_index = len(toc) - 1
    for index, (level, title, first_page) in enumerate(toc):
        page_from = min(max(1, first_page), page_count)
        if index == last_index:
            page_to = page_count
        else:
            following_start = toc[index + 1][2]
            page_to = max(1, min(page_count, following_start - 1))
            # Entries sharing a page (or listed out of order) would end
            # before they start; collapse those to a single page.
            page_to = max(page_to, page_from)
        ranges.append(
            TocItem(level=level, title=title, page_from=page_from, page_to=page_to)
        )
    return ranges
|
||||||
|
|
||||||
|
|
||||||
|
def _hash_doc(doc: fitz.Document) -> str:
    """Return the same cache key for every document.

    This is deliberately not a real hash: all documents map to one key, so
    callers must ensure that a single global document is used throughout.
    """
    constant_key = "12"
    return constant_key
|
||||||
|
|
||||||
|
@st.cache_resource(hash_funcs={fitz.Document: _hash_doc})
def slice_pdf_pages(src_doc: fitz.Document, page_from: int, page_to: int) -> bytes:
    """Return a new PDF, as bytes, holding pages ``page_from..page_to``.

    Args:
        src_doc: Source document. It is only read; ownership stays with the
            caller, so it is NOT closed here.
        page_from: First page to copy, 1-based. Values below 1 are treated
            as 1.
        page_to: Last page to copy, 1-based and inclusive. It is raised to
            ``page_from`` if smaller and capped at the last page.

    Returns:
        The serialized PDF containing the selected pages.

    Raises:
        ValueError: If the range starts beyond the last page of ``src_doc``
            (including when the document has no pages).
    """
    last_index = src_doc.page_count - 1
    start_index = max(1, page_from) - 1
    end_index = min(max(start_index, page_to - 1), last_index)
    if start_index > end_index:
        # Guard explicitly: insert_pdf copies pages in reverse order when
        # from_page > to_page, and an empty document cannot be saved.
        raise ValueError(
            f"page range {page_from}-{page_to} is outside the document "
            f"({src_doc.page_count} pages)"
        )

    new_pdf = fitz.open()
    try:
        # Copy the whole range in a single call instead of page by page.
        new_pdf.insert_pdf(src_doc, from_page=start_index, to_page=end_index)
        return new_pdf.tobytes()
    finally:
        # Only the temporary document is released; the source document is
        # shared with the caller and must stay open.
        new_pdf.close()
|
||||||
|
|
||||||
|
|
||||||
|
def format_label(item: TocItem) -> str:
    """Build the drop-down label ``"<title> (<from> - <to>)"`` for an entry.

    Page numbers are zero-padded to three digits. ``page_to`` may be
    ``None`` (meaning "until the end of the document"); in that case the
    label shows "末页" (last page) instead of a number, because ``None``
    cannot be formatted with ``:03d``.
    """
    if item.page_to is None:
        return f"{item.title} ({item.page_from:03d} - 末页)"
    return f"{item.title} ({item.page_from:03d} - {item.page_to:03d})"
|
||||||
|
|
||||||
|
@st.cache_resource
def read_pdf():
    """Open ``doc.pdf`` from the working directory and read its outline.

    If the file is missing, an error is shown on the page and the script
    run is stopped.

    Returns:
        A tuple ``(doc, page_count, items)``: the opened document, its page
        count, and the TOC entries with their page spans.
    """
    source = Path("doc.pdf")
    if not source.exists():
        st.error("找不到doc.pdf")
        st.stop()

    document = fitz.open(source, filetype="pdf")

    def _ignore_close():
        """Do nothing: the cached, app-wide document must never be closed."""
        return None

    # The document lives in the Streamlit resource cache, so any later
    # close() call on it is turned into a no-op.
    document.close = _ignore_close

    total_pages = document.page_count
    # Read the table of contents and derive a page span per entry.
    toc_items = normalize_ranges(read_toc(document), total_pages)
    return document, total_pages, toc_items
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Render the page: chapter picker, download button and PDF preview.

    The chapter is selected by its position in the TOC list rather than by
    its label text. The placeholder occupies the first slot of the drop-down,
    so looking the label up in the label list and using that index directly
    on ``items`` would pick the following chapter (and fail on the last one);
    selecting by position also stays unambiguous when labels repeat.
    """
    st.set_page_config(page_title="PDF 目录查看器", layout="wide")
    st.title("PDF 目录查看器")

    doc, page_count, items = read_pdf()

    st.subheader("目录")
    placeholder = -1  # sentinel option meaning "nothing chosen yet"
    choice = st.selectbox(
        "选择章节",
        [placeholder, *range(len(items))],
        index=0,
        format_func=lambda i: "请选择" if i == placeholder else format_label(items[i]),
    )
    if choice == placeholder:
        st.stop()

    chosen = items[choice]
    rng_from = chosen.page_from
    # An open-ended entry (page_to is None) runs to the last page.
    rng_to = chosen.page_to or page_count

    st.subheader("预览")
    try:
        sliced_bytes = slice_pdf_pages(doc, rng_from, rng_to)
        st.download_button(
            "下载",
            sliced_bytes,
            file_name=f"{chosen.title}.pdf",
            mime="application/pdf",
        )
        # The rendering mode comes from the selectbox below; its value is
        # kept in session state under the key "render".
        pdf_viewer(sliced_bytes, rendering=st.session_state.get("render", "unwrap"))
    except Exception as e:
        # UI boundary: report the failure on the page instead of crashing.
        st.error(f"渲染失败:{e}")

    st.selectbox(
        "使用其他渲染方式",
        ["unwrap", "legacy_embed", "legacy_iframe"],
        key="render",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: build the page only when this file is executed
# directly (e.g. via `streamlit run app.py`), not when it is imported.
if __name__ == "__main__":
    main()
|
3
pdf_index/requirements.txt
Normal file
3
pdf_index/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
streamlit>=1.37.0
|
||||||
|
pymupdf>=1.24.0
|
||||||
|
streamlit_pdf_viewer
|
Reference in New Issue
Block a user