scrape!
by
noromanba
2015-12-27 [2015/12/27 11:45:28]
(Forked from
scrape by
noromanba)
download current html snapshot w/ abs path
@@ -1,8 +1,9 @@
/*
- * @title scrape
- * @description download current html snapshot
+ * @title scrape!
+ * @description download current html snapshot w/ abs path
* @include http://*
* @include https://*
+ * @contributor noromanba http://let.hatelabo.jp/noromanba/let/hJmcu63OhsMl (Fork of)
* @license MIT License http://opensource.org/licenses/MIT
* @javascript_url
*/
@@ -31,8 +32,28 @@
.replace(PUNCT,'_').replace(/(?:_){2,}/g, '_') + '.html';
pool.download = filename;
+ const sham = document.cloneNode(true);
+ Array.from(sham.querySelectorAll([
+ '[src^="/"]:not([src^="//"])',
+ '[src^="."]',
+ '[href^="/"]:not([href^="//"])',
+ '[href^="."]'
+ ])).forEach(node => {
+ // dirty-hack: replace rel-path to abs-path
+ // if you need rel-path by DOM, use node.getAttribute('src') c.f.
+ // http://subtech.g.hatena.ne.jp/secondlife/20090624/1245809935
+ if (node.src) {
+ node.src = node.src;
+ return;
+ }
+ if (node.href) {
+ node.href = node.href;
+ return;
+ }
+ });
+
// get <!DOCTYPE> et al.
- let root = document.documentElement;
+ let root = sham.documentElement;
let buff = [];
while (root.previousSibling) {
buff.unshift(root.previousSibling);
@@ -44,11 +65,11 @@
// http://caniuse.com/#feat=xml-serializer
const snapshot = buff.map(node => {
return new XMLSerializer().serializeToString(node);
- }).join('\n') + document.documentElement.outerHTML;
+ }).join('\n') + sham.documentElement.outerHTML;
const snapURL = window.URL.createObjectURL(new Blob([snapshot], {
// FIXME always save UTF-8
- type: document.contentType + ';charset=' + document.characterSet
+ type: sham.contentType + ';charset=' + sham.characterSet
}));
pool.href = snapURL;
/*
* @title scrape!
* @description download current html snapshot w/ abs path
* @include http://*
* @include https://*
* @contributor noromanba http://let.hatelabo.jp/noromanba/let/hJmcu63OhsMl (Fork of)
* @license MIT License http://opensource.org/licenses/MIT
* @javascript_url
*/
// via
// http://blog.mudatobunka.org/entry/2015/12/23/211425
// thx id:todays_mitsui
// c.f.
// http://h.hatena.ne.jp/noromanba/81805704781216737
/**
 * Bookmarklet body: download a snapshot of the current page as one .html file.
 *
 * Steps:
 *   1. Build a filesystem-friendly filename from the page URL.
 *   2. Deep-clone the document and rewrite root-relative ("/...") and
 *      dot-relative ("./...", "../...") `src` / `href` attributes on the clone
 *      to absolute URLs; the live page is not modified.
 *   3. Serialize the nodes preceding <html> (doctype, comments) plus the
 *      <html> tree, and hand the result to the browser as a Blob download.
 *
 * Takes no arguments and returns nothing; it runs for its side effect only.
 */
(() => {
    'use strict';

    // How long the blob URL is kept alive after the click. Revoking it in the
    // same tick as click() can cancel the download before the browser has
    // started reading the blob.
    const REVOKE_DELAY_MS = 10000;

    const pool = document.createElement('a');

    // POSIX class [:punct:] is not available in JS regexes; this is the
    // expanded ASCII set, c.f.
    // http://www.regular-expressions.info/posixbrackets.html#class
    // [!"#$%&'()*+,\-.:;<=>?@[\\\]^_`{|}~] minus "-._" plus "\s"
    const PUNCT = /[!"#$%&'()*+,:;<=>?@[\\\]^`{|}~\s]/g;

    const rawName = [
        location.hostname,
        location.pathname.split('/').slice(1).join('_'),
        location.search,
        location.hash
    ].filter(s => !!s).join('_');

    // decodeURIComponent() throws URIError on malformed percent-escapes
    // (e.g. a lone "%"); fall back to the undecoded name in that case. Any
    // leftover "%" is replaced by PUNCT below.
    let readableName;
    try {
        readableName = decodeURIComponent(rawName);
    } catch (err) {
        if (!(err instanceof URIError)) {
            throw err;
        }
        readableName = rawName;
    }
    const filename = readableName
        .replace(PUNCT, '_').replace(/(?:_){2,}/g, '_') + '.html';
    pool.download = filename;

    const sham = document.cloneNode(true);

    // Replace rel-path with abs-path on the clone.
    // The attribute text is resolved explicitly instead of round-tripping
    // through the `src` / `href` DOM properties: on SVG elements `href` is a
    // read-only SVGAnimatedString, so `node.href = node.href` throws a
    // TypeError in strict mode and would abort the whole snapshot.
    ['src', 'href'].forEach(attr => {
        const selector = [
            `[${attr}^="/"]:not([${attr}^="//"])`,
            `[${attr}^="."]`
        ].join(',');
        Array.from(sham.querySelectorAll(selector)).forEach(node => {
            // A <base> element's own href is resolved against the document
            // URL, not against the base URL that it defines itself.
            const base = node.localName === 'base'
                ? document.URL
                : document.baseURI;
            try {
                const absolute = new URL(node.getAttribute(attr), base).href;
                node.setAttribute(attr, absolute);
            } catch (err) {
                // new URL() throws TypeError for unparsable input; leave such
                // an attribute exactly as the page author wrote it.
                if (!(err instanceof TypeError)) {
                    throw err;
                }
            }
        });
    });

    // get <!DOCTYPE> et al.: every node that precedes <html>, in document order
    const prolog = [];
    for (let node = sham.documentElement.previousSibling;
        node;
        node = node.previousSibling) {
        prolog.unshift(node);
    }

    // dump <!DOCTYPE> et al. and <html> tree, old new thing; c.f.
    // https://developer.mozilla.org/en-US/docs/XMLSerializer
    // http://caniuse.com/#feat=xml-serializer
    const serializer = new XMLSerializer();
    const snapshot = prolog
        .map(node => serializer.serializeToString(node))
        .join('\n') + sham.documentElement.outerHTML;

    // String parts of a Blob are always encoded as UTF-8, so label the blob
    // accordingly. When the page itself is not UTF-8, its markup may still
    // declare the legacy charset; a leading BOM makes the saved file decode
    // as UTF-8 regardless of such a declaration.
    const isUtf8 = String(document.characterSet).toUpperCase() === 'UTF-8';
    const parts = isUtf8 ? [snapshot] : ['\uFEFF', snapshot];
    const snapURL = window.URL.createObjectURL(new Blob(parts, {
        type: `${document.contentType};charset=utf-8`
    }));

    pool.href = snapURL;
    pool.click();
    setTimeout(() => {
        window.URL.revokeObjectURL(snapURL);
    }, REVOKE_DELAY_MS);
})();
// NOTICE
// XMLSerializer().serializeToString() sometimes over-escapes text nodes, e.g.
// <script> 1 && 1; 1 < 1;</script> -> <script> 1 &amp;&amp; 1; 1 &lt; 1;</script>
- Permalink
- このページへの個別リンクです。
- RAW
- 書かれたコードへの直接のリンクです。
- Packed
- 文字列が圧縮された書かれたコードへのリンクです。
- Userscript
- Greasemonkey 等で利用する場合の .user.js へのリンクです。
- Loader
- @require やソースコードが長い場合に多段ロードする Loader コミのコードへのリンクです。
- Metadata
- コード中にコメントで @xxx と書かれたメタデータの JSON です。