diff --git a/zxxk_dl/json.drawio b/zxxk_dl/json.drawio new file mode 100644 index 0000000..747e1b1 --- /dev/null +++ b/zxxk_dl/json.drawio @@ -0,0 +1 @@ +7V1bc9o4FP41etiHMLbk62MgpMlO2+lsHrbdNxUEeGss1lYC9NevjCXAWDYOxsQGk5kGH10s6ZzznYsUFaDBfPUpxIvZFzomPoDaeAXQA4BQdxyb/4opa0HRHS2hTENvLGg7wov3mwiirPbqjUmUqsgo9Zm3SBNHNAjIiKVoOAzpMl1tQv30Wxd4SjKElxH2s9S/vTGbyWlY7q7giXjTmXi1A8WM51hWFjOJZnhMl3skNARoEFLKkm/z1YD48erJdUnaPeaUbgcWkoCVabA0/3x7Hj19iVb/rKD9CT9P2ac7JKbxhv1XMWMxWraWSzAN6etCVCMhIyvVwuOfsrqWHZi+nS4XFELnhIVrXkV2ZDtJEyEjjuh5uVtvRzcS2mxvqS3LEnwWPJ5uu94tA/8iVuIdqwKtzCKQMRcL8UhDNqNTGmB/uKP2+SoFYxJ3q/GnXZ3PlC44UefEfwljayHj+JVRTpqxuS9KJzRgj3ju+fE6PBH/jTBvhEWBaAU18TygPg03Q0Pa5sPpJBjfxyLPyQENSEJ59OK5b8aUTCqeSa4IFQpJRF/DESmomMPnkPiYeW/pt6qYJpp+ox4fz1Y+IDJ6aP9jpMTFRFpPUSr7ZzicEia63MkFXyi83qu2iCtEJw8C2mZP2y+2DoQwed9OJLfLU0FK7YzuTrAfEaXsfsY/OTSn5A373jTg30ec+4TLUj/Wbi5x/r0omHvjcSLaJPJ+Cw2PBUksFu/c7APzIUdEc2VZJbuV8KVAiXNR507r6doB8twJ8DtVYLdGLt2CTiYRYeAQlc4gAroCvS2fiXVOSYH13yuVBXfRhlX3vALSFqvN8sty/m0a/36O/sKh7I0PLukwKctI2A77YsFazjxGXhZ4gxVL7hik5S6X1SWBKMtSqYTISSshyhgS3XCyhsQ28jlcyY6YTTUjup1VReMh/tkYDT7372IAm4cf8UPPlI8Pq/3Ch/WZzEuC1AUVcxS6onkxDdiz9j52GhJ0mMb9kvYl8xpDujVSWA0r9VrpUMp+E2Nb2W4ZllMwu5xB5M7BRenFqLM3S093Zhd2ZurOGVtDI93acuo35zr6CLQ4VdlPRRmVszrycRR5o5S/ql8CUPR6HNZO1Vuk6jpEKse+Xs/9Q1S9+Sp7fttu2arYULDe0OyetvfR7c60X0Tf+cLXr2SC14rYaOy9lQmNdFuERprvBeRORgybIjcnajoIv0ZbLdpVRMY4/tknvWMshy8McfgtJG8eWT4HE5obr23Jm7mnqde0HGD4CPoD4DhsvRB1cSxaf7xzZarEtxk45fAo4VTk5cDZQ2DRDYJaASborq1MYn1YgJxNNl/IDopCtHVsT3CBycpj36Ubzb/vteJPu0bxw7li5HIp2BKur1GLvT0ifsjtIUVxk1K17+5NXVyrZUO5li1a4KAUfpo5+MlC3qXo7GeoQMnkDTkwedlEM6opb1wIsFpPc10BkVWVRUtJ0h100j3Ulza2TnONJiJ4iCVoQIOI8iAAQD4EPhELz2MzKKoPOEx5nNVQ+0qWh4VJkzkNaJTY0Njkn9/ZOMWJUHsoIgDKvPCFTthXPCcX9C38WLv6ePRruml24FZUTv1e0lXJic32fBHXUQX2Kc/EcA+ys3VtCWf32q5QZUpKfyX/3AKuBRwbDA3Q74O+BYYmcDXgPHRadIoW2brZIi1yFFrU7kTYxxy0sEp6+XZFL78Ss/X83ekrwsz63YynWHI7cDyfi7EuBMaGQqcOb0GbLuGBuOAeAscAQwfc28BxYoqLQH8YOyfOENybnbad0RVpp7ZpN+2pHE8zls0zStiqfwewGr+NNrG3rqNax7lpX4qbp5zVgp
pbuO9aej/3pL3Qbnt325uDFCMvffaqWmsB1PUe1fqQg50tPKp1FFBKZ+VrApTu6EaLdFu9u1ivpt9E3rfbKml2ZFWcxyhxqqNRoZUqCXx1KtVtpbROy4rzF23TMtUef3si3PoTGHZJD1XCVcMTGFCVsLo6VO02W5oLn8VOCnStRuMlvInNym57pb36VeyeNF2/9BIXjdyyPyLt9/GM2cX+pKoanpq5eFrWlsO8qwnAAIK+/e5/S95lwPWWgdrOL8ciGQk5448RC+kvcghLWVSpfL/GpgMxSWnqLu0DIMfpmRmUkhdC7cOSPBp9frFUwdDVmfkun9dsU47kzSTyCiMZZDbUeKObCC67lF3rFOlw+67pigRv+wD0cS+47M0d6GL7xtWAE94CcHZZueYi5KGrYemW6uRBQ/ES5f+p6xXpT5eVa69+HXogLdMvVda780cy9vu4P4La4Y+ojjMd8lt12m+PWWlul2HdkUuLT7x946hY5jD9fNfHwRxe7im7qUhuSVrN1yJDo6dnYebIQcbMWyzpksscHnKLur3MFR6mCwsHrT5meKbWyiPo9d73USZq7LS2LFTD69fa7W6g7BZZMvN+O5pqGbpiM+JETeWPu/+qIam++x8v0PB/ \ No newline at end of file diff --git a/zxxk_dl/main.py b/zxxk_dl/main.py index 32cc118..eff3a92 100644 --- a/zxxk_dl/main.py +++ b/zxxk_dl/main.py @@ -1,5 +1,4 @@ import requests -from time import strftime from re import findall, sub from hashlib import md5 diff --git a/zxxk_dl/process.md b/zxxk_dl/process.md new file mode 100644 index 0000000..60f3ccc --- /dev/null +++ b/zxxk_dl/process.md @@ -0,0 +1,213 @@ +# 对于某网站预览文档分析 + +Released in [52pojie](https://www.52pojie.cn/thread-1757323-1-1.html) + +## 背景 + +某网站资源页面存在预览,有的只有部分,有的能显示全部。对于文字版文档,文件是矢量版,看起来效果很好。 + +## 预览数据抓包 + +在页面内打开一个没有打开的文档,抓包找到大量svg文件请求: + +链接(已隐藏部分数据): + +```url +https://preview.*****.com/resource/oss/preview/rbm-preview-product/rbm/*******/svg/6.svg?Expires=167*****11&Signature=qK47**********rYq43G6Q%3D +``` + +将抓包数据导入apifox,删除cookie和header测试是否存在验证,结果证明不存在除了url param外的其他认证。 + +## 寻找根源请求数据 + +上文已经找到了每一页预览的请求,但是由于`Signature`的存在还是无法实现自动请求。刷新页面,搜索上述链接可能的关键词`preview`。 + +观察到有一个疑似请求: + +```url +https://www.****.com/soft/Preview/FirstLoadPreviewJson?softID=33****78&fileaddress=&type=3&product=1&v=2&FullPreview=false +``` + +响应内容是一个json: + +```json +{ + "code": 0, + "success": false, + "data": { + "IsSuccess": true, + "IsRar": true, + "SoftExt": null, + "Html": "\r\n\u003cstyle\u003e\r\n {{请求内容过长忽略}} \u003e\r\nHHHHHHisrar", + "PreviewPage": 0, + "TotalPage": 0, + "rarPreviewInfo": 
[ + { + "FileId": 38****99, + "Html": "\r\n\u003cdiv class=\"multiple-date-preview-file\" data-index=\"3****499\"\u003e\r\n \u003cdiv class=\"preview-main\"\u003e\r\n \r\n \u003cimg data-original=\"https://preview.***.com/resource/oss/preview/rbm-preview-product/rbm/38****99/svg/1.svg?Expires=1678604711\u0026Signature=7w%2BPVktPi********5Lha0%3D\" onselectstart=\"return false\" /\u003e\u003cimg data-original=\"https://preview.***.com/resource/oss/preview/rbm-preview-product/rbm/38****99/svg/2.svg?Expires=16****4711\u0026Signature=zt20********BQ1RGZDlTaB0%3D\" onselectstart=\"return false\" /\u003e\u003c {{以下存在类似内容,忽略}} \u003c/div\u003e\r\n\u003c/div\u003e\r\n\u003cscript\u003e\r\n document.oncontextmenu = function () {\r\n window.event.returnValue = false;\r\n };\r\n var pageNum = 9;\r\n var hasPreview = true;\r\n if (pageNum \u0026\u0026 window.setDocumentPageNum) {\r\n window.setDocumentPageNum(pageNum, hasPreview);\r\n }\r\n\u003c/script\u003e\r\n\r\n", + "SoftName": "************.docx", + "IconClassName": "icon-doc-doc", + "TotalPage": 9, + "PreviewPage": 9, + "IsMedia": false, + "Url": null + }, + { + "FileId": 3****500, + "Html": "{{类似上文}}", + "SoftName": "********.docx", + "IconClassName": "icon-doc-doc", + "TotalPage": 5, + "PreviewPage": 5, + "IsMedia": false, + "Url": null + } + ] + }, + "msg": "预览错误", + "message": null +} +``` + +PS:上文中的`"msg": "预览错误"`对任意文档都相同,可以忽略。 + +经分析,上文的第一个`Html`内容基本无效,都是限制用户的脚本。 +`rarPreviewInfo`中的`Html`内容格式化后如下: + +```html +
+
+ + + + +
+
+ +``` + +显然,preview链接被直接返回了,经检验该链接可以直接请求。 +最后导入apifox验证后可以确定无需特殊headers或cookies。 + +## 最小化请求链接和寻找相关参数 + +当前链接参数: + +softID可以直接在文档页面的链接中找到: + +```url +https://www.****.com/soft/33****78.html +``` + +上文的`33****78`即为softID + +`type`, `product`, `v`参数不知道用途,对任意文档都是相同的,疑似网站作者还没有实现,直接引用原文了。 + +`FullPreview`原本是`false`,改为`true`就直接返回全部内容的预览了..... + +## 分析返回的json + +测试了多个文档后确定以下结论: + +## demo代码 + +**代码无法运行,因为关键域名被"\*"替代** + +```python +import requests +from re import findall, sub +from hashlib import md5 + +HTML_FORMAT =''' + + + +{title} + + + +{body} + + + +''' + +def writefile(filename,text): + filename = sub(r"""[\*\/\\\|\<\>\? \:\.\'\"\!]""", "", filename) + unique = md5(text.encode()) + filename += "_"+unique.hexdigest()[:5] + filename+=".html" + print("Writing "+filename) + # print("-=-=-=-=\n",text,"\n-=-=-=-=") + with open(filename+'.html', 'w', encoding="utf-8") as f: + f.write(text) + + +def main(): + softID=input("ID: ") + url = "https://www.*******.com/soft/Preview/FirstLoadPreviewJson?softID={}&type=3&product=1&v=2&FullPreview=true" + response = requests.get(url.format(softID)) + if response.status_code!=200: + print("ERROR") + print(response.status_code) + return -1 + ret=response.json()["data"] + if not ret["IsSuccess"]: + print("ERROR: IsSuccess option is not true") + print(ret) + if not ret['IsRar']: + print("Not rar") + print("TotalPage=%d" % ret['TotalPage']) + print("SoftExt=%s" % ret['SoftExt']) + try: + html=ret["Html"] + print(ret) + except: + print(ret) + exit(1) + # replace "data-original" to "src" for showing in browser + html=html.replace("data-original", "src") + writefile(f"{softID}",html) + else: + print("is RAR") + rar=ret['rarPreviewInfo'] + for file in rar: + html=file["Html"] + title=file["SoftName"] + # replace "data-original" to "src" for showing in browser + # html=html.replace("data-original", "src") + urls=findall("(?<=data-original=\")https://preview.*******.com/\\S+(?=\")",html) + l=[] + for url in urls: + if "jpg" in url: +
l.append(f"") + continue + page=requests.get(url,cookies=response.cookies) + if not page.status_code==200: + print(page) + print(page.status_code) + print(page.text) + assert page.status_code==200 + l.append(page.text) + format_html=HTML_FORMAT.format(title=title,body="\n".join(l)) + writefile(title,format_html) + +if __name__ == "__main__": + while True: + main() +``` diff --git a/zxxk_dl/screeshot/1.png b/zxxk_dl/screeshot/1.png new file mode 100644 index 0000000..cadfd6b Binary files /dev/null and b/zxxk_dl/screeshot/1.png differ diff --git a/zxxk_dl/screeshot/2.png b/zxxk_dl/screeshot/2.png new file mode 100644 index 0000000..44b917f Binary files /dev/null and b/zxxk_dl/screeshot/2.png differ diff --git a/zxxk_dl/screeshot/3.png b/zxxk_dl/screeshot/3.png new file mode 100644 index 0000000..3ca7bd8 Binary files /dev/null and b/zxxk_dl/screeshot/3.png differ diff --git a/zxxk_dl/screeshot/4.png b/zxxk_dl/screeshot/4.png new file mode 100644 index 0000000..208738f Binary files /dev/null and b/zxxk_dl/screeshot/4.png differ diff --git a/zxxk_dl/screeshot/5.png b/zxxk_dl/screeshot/5.png new file mode 100644 index 0000000..208c4f2 Binary files /dev/null and b/zxxk_dl/screeshot/5.png differ