diff --git a/VideoCompress/.gitignore b/VideoCompress/.gitignore
new file mode 100644
index 0000000..2e29888
--- /dev/null
+++ b/VideoCompress/.gitignore
@@ -0,0 +1,7 @@
+logs
+test
+config.json
+*.xml
+tmp
+build
+dist
\ No newline at end of file
diff --git a/pdf_index/README.md b/pdf_index/README.md
new file mode 100644
index 0000000..bfd7908
--- /dev/null
+++ b/pdf_index/README.md
@@ -0,0 +1,36 @@
+# Streamlit PDF 目录查看器
+
+一个最小可运行的 Streamlit 应用:
+- 读取 `doc.pdf`(放在当前目录),或通过页面上传 PDF。
+- 使用 PyMuPDF 提取 PDF 目录(Table of Contents)。
+- 在下拉框选择目录项后,显示该目录项对应的页面范围(到下一个目录项前一页)。
+- 使用 `st.pdf` 组件内嵌查看选定页面范围的临时 PDF。
+
+## 快速开始(Windows / cmd)
+
+1) 建议创建虚拟环境(可选)
+```cmd
+python -m venv .venv
+.venv\Scripts\activate
+```
+
+2) 安装依赖
+```cmd
+pip install -r requirements.txt
+```
+
+3) 将你的 PDF 放到本目录并命名为 `doc.pdf`(或在页面中上传)。
+
+4) 运行应用
+```cmd
+streamlit run app.py
+```
+
+## 用法说明
+- 左侧边栏可上传 PDF;若本地存在 `doc.pdf`,也会自动被加载。
+- 目录下拉框显示形如 `title (page)`。
+- 若 PDF 无目录,本应用会提示;可选择“全部页面”查看。
+
+## 已知限制
+- 目录页码通常为 PDF 内部页码(从 1 开始),个别 PDF 的 TOC 可能与实际页面偏移不一致。
+- 页面范围切片依赖 TOC 顺序,若 TOC 不规范可能导致范围不准。
diff --git a/pdf_index/app.py b/pdf_index/app.py
new file mode 100644
index 0000000..7c69fe8
--- /dev/null
+++ b/pdf_index/app.py
@@ -0,0 +1,160 @@
+from dataclasses import dataclass
+from typing import Any, List, Optional, Tuple
+from pathlib import Path
+
+import fitz  # PyMuPDF
+import streamlit as st
+from streamlit_pdf_viewer import pdf_viewer
+
+
+@dataclass
+class TocItem:
+    """One table-of-contents entry and the page range it covers."""
+
+    level: int
+    title: str
+    page_from: int  # 1-based
+    page_to: Optional[int]  # 1-based inclusive; None means until end
+
+
+def read_toc(doc: fitz.Document) -> List[Tuple[int, str, int]]:
+    """Return the document outline as ``(level, title, page)`` tuples.
+
+    Pages are 1-based; page numbers below 1 are clamped to 1. Any failure
+    while reading the outline is treated as "no outline".
+    """
+    try:
+        get_toc: Any = getattr(doc, "get_toc", None)
+        raw = get_toc(simple=True) if callable(get_toc) else []
+    except Exception:
+        # Best effort: a broken outline must not prevent opening the PDF.
+        raw = []
+    return [(lvl, title, max(1, pg)) for (lvl, title, pg) in raw]
+
+
+def normalize_ranges(toc: List[Tuple[int, str, int]], page_count: int) -> List[TocItem]:
+    """Turn raw TOC tuples into entries with explicit 1-based page ranges.
+
+    Each entry runs from its own page to the page before the next entry
+    starts (but always covers at least one page); the last entry runs to
+    the final page. Returns an empty list for an empty TOC or document.
+    """
+    if not toc or page_count < 1:
+        return []
+    items: List[TocItem] = []
+    for i, (lvl, title, page) in enumerate(toc):
+        start = min(max(1, page), page_count)
+        if i + 1 < len(toc):
+            # max(start, ...) keeps the range non-empty when the next entry
+            # starts on the same page as (or before) this one.
+            end = max(start, min(page_count, toc[i + 1][2] - 1))
+        else:
+            end = page_count
+        items.append(TocItem(level=lvl, title=title, page_from=start, page_to=end))
+    return items
+
+
+def _hash_doc(doc: fitz.Document) -> str:
+    """Return a constant cache key for ``fitz.Document`` arguments.
+
+    This is deliberately not a real hash: the app only ever works with the
+    single document returned by ``read_pdf``, so cached slices are told
+    apart by their page range alone.
+    """
+    return "12"
+
+
+@st.cache_resource(hash_funcs={fitz.Document: _hash_doc})
+def slice_pdf_pages(src_doc: fitz.Document, page_from: int, page_to: int) -> bytes:
+    """Return a new PDF, as bytes, with pages ``page_from``..``page_to``.
+
+    Both bounds are 1-based and inclusive and are clamped to the pages that
+    exist in ``src_doc``. The source document is left open: it is shared
+    through Streamlit's resource cache and is reused by later calls.
+
+    Raises:
+        ValueError: if ``src_doc`` has no pages.
+    """
+    last_i = src_doc.page_count - 1
+    if last_i < 0:
+        raise ValueError("source PDF has no pages")
+    start_i = min(max(1, page_from) - 1, last_i)
+    end_i = min(max(start_i, page_to - 1), last_i)
+    new_pdf = fitz.open()
+    try:
+        # A single ranged copy; start_i <= end_i is guaranteed above, so the
+        # pages are never inserted in reverse order.
+        new_pdf.insert_pdf(src_doc, from_page=start_i, to_page=end_i)
+        return new_pdf.tobytes()
+    finally:
+        new_pdf.close()
+
+
+def format_label(item: TocItem) -> str:
+    """Return the selectbox label for *item*: title plus zero-padded range."""
+    return f"{item.title} ({item.page_from:03d} - {item.page_to:03d})"
+
+
+@st.cache_resource
+def read_pdf():
+    """Open ``doc.pdf`` from the working directory and index its outline.
+
+    The result is cached by Streamlit, so the document stays open for the
+    lifetime of the app and must not be closed by callers.
+
+    Returns:
+        A ``(doc, page_count, items)`` tuple.
+    """
+    pdf_path = Path("doc.pdf")
+    if not pdf_path.exists():
+        st.error("找不到doc.pdf")
+        st.stop()
+
+    doc = fitz.open(pdf_path, filetype="pdf")
+    page_count = doc.page_count
+
+    # Read the outline and derive a page range for every entry.
+    items = normalize_ranges(read_toc(doc), page_count)
+
+    return doc, page_count, items
+
+
+def main():
+    """Render the page: chapter picker, download button and PDF preview."""
+    st.set_page_config(page_title="PDF 目录查看器", layout="wide")
+    st.title("PDF 目录查看器")
+
+    doc, page_count, items = read_pdf()
+    if not items:
+        # No outline: offer the whole document as one entry. A new list is
+        # built so the cached result of read_pdf is not mutated.
+        st.info("该 PDF 没有目录。")
+        items = [TocItem(level=1, title="全部页面", page_from=1, page_to=page_count)]
+
+    st.subheader("目录")
+    # Select by position, not by label text: -1 is the placeholder, so the
+    # chosen index maps directly onto items and duplicate labels are safe.
+    idx = st.selectbox(
+        "选择章节",
+        [-1, *range(len(items))],
+        index=0,
+        format_func=lambda i: "请选择" if i < 0 else format_label(items[i]),
+    )
+    if idx is None or idx < 0:
+        st.stop()
+    chosen = items[idx]
+    rng_from, rng_to = chosen.page_from, chosen.page_to or page_count
+
+    st.subheader("预览")
+    try:
+        sliced_bytes = slice_pdf_pages(doc, rng_from, rng_to)
+        st.download_button("下载", sliced_bytes, file_name=f"{chosen.title}.pdf")
+        pdf_viewer(sliced_bytes, rendering=st.session_state.get("render", "unwrap"))
+    except Exception as e:
+        st.error(f"渲染失败:{e}")
+
+    st.selectbox("使用其他渲染方式", ["unwrap", "legacy_embed", "legacy_iframe"], key="render")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pdf_index/requirements.txt b/pdf_index/requirements.txt
new file mode 100644
index 0000000..77d4d50
--- /dev/null
+++ b/pdf_index/requirements.txt
@@ -0,0 +1,3 @@
+streamlit>=1.37.0
+pymupdf>=1.24.0
+streamlit_pdf_viewer