scrape!

    @@ -1,8 +1,9 @@ /* - * @title scrape - * @description download current html snapshot + * @title scrape! + * @description download current html snapshot w/ abs path * @include http://* * @include https://* + * @contributor noromanba http://let.hatelabo.jp/noromanba/let/hJmcu63OhsMl (Fork of) * @license MIT License http://opensource.org/licenses/MIT * @javascript_url */ @@ -31,8 +32,28 @@ .replace(PUNCT,'_').replace(/(?:_){2,}/g, '_') + '.html'; pool.download = filename; + const sham = document.cloneNode(true); + Array.from(sham.querySelectorAll([ + '[src^="/"]:not([src^="//"])', + '[src^="."]', + '[href^="/"]:not([href^="//"])', + '[href^="."]' + ])).forEach(node => { + // dirty-hack: replace rel-path to abs-path + // if you need rel-path by DOM, use node.getAttribute('src') c.f. + // http://subtech.g.hatena.ne.jp/secondlife/20090624/1245809935 + if (node.src) { + node.src = node.src; + return; + } + if (node.href) { + node.href = node.href; + return; + } + }); + // get <!DOCTYPE> et al. - let root = document.documentElement; + let root = sham.documentElement; let buff = []; while (root.previousSibling) { buff.unshift(root.previousSibling); @@ -44,11 +65,11 @@ // http://caniuse.com/#feat=xml-serializer const snapshot = buff.map(node => { return new XMLSerializer().serializeToString(node); - }).join('\n') + document.documentElement.outerHTML; + }).join('\n') + sham.documentElement.outerHTML; const snapURL = window.URL.createObjectURL(new Blob([snapshot], { // FIXME always save UTF-8 - type: document.contentType + ';charset=' + document.characterSet + type: sham.contentType + ';charset=' + sham.characterSet })); pool.href = snapURL;
  • /*
     * @title scrape!
     * @description download current html snapshot w/ abs path
     * @include http://*
     * @include https://*
     * @contributor noromanba http://let.hatelabo.jp/noromanba/let/hJmcu63OhsMl (Fork of)
     * @license MIT License http://opensource.org/licenses/MIT
     * @javascript_url
     */
    
    // via
    // http://blog.mudatobunka.org/entry/2015/12/23/211425
    // thx id:todays_mitsui
    
    // c.f.
    // http://h.hatena.ne.jp/noromanba/81805704781216737
    
    // Bookmarklet entry point: clones the current document, rewrites
    // root-relative ("/x") and dot-relative ("./x", "../x") src/href attributes
    // to absolute URLs, serializes the clone (including <!DOCTYPE> and any other
    // nodes preceding <html>) and offers the result as an .html download whose
    // name is derived from the page location.
    (() => {
        'use strict';

        // How long the blob URL is kept alive after the click, so the browser
        // has time to start reading it before it is released.
        const REVOKE_DELAY_MS = 10000;

        // POSIX class [:punct:] not impl yet, alt expanded ASCII c.f.
        // http://www.regular-expressions.info/posixbrackets.html#class
        // [:punct:] minus "-._" plus "\s"; "/" is included because
        // location.search / location.hash (or a decoded %2F) may contain it.
        const PUNCT = /[!"#$%&'()*+,\/:;<=>?@[\\\]^`{|}~\s]/g;

        // Matches the same attribute values as the selectors below:
        // a leading "/" that is not "//", or a leading ".".
        const RELATIVE = /^(?:\/(?!\/)|\.)/;

        const CHARSET_PARAM = /charset\s*=\s*[^\s;]+/i;

        // decodeURIComponent() throws URIError on malformed percent sequences
        // (e.g. "?q=100%"); fall back to the raw text instead of aborting.
        const safeDecode = text => {
            try {
                return decodeURIComponent(text);
            } catch (err) {
                if (err instanceof URIError) {
                    return text;
                }
                throw err;
            }
        };

        const filename = [
            location.hostname,
            location.pathname.split('/').slice(1).join('_'),
            location.search,
            location.hash
        ].filter(Boolean).map(safeDecode).join('_')
            .replace(PUNCT, '_').replace(/_{2,}/g, '_') + '.html';

        const sham = document.cloneNode(true);
        const base = document.baseURI;

        // Rewrite one attribute of the cloned node to an absolute URL.
        // Works on the attribute instead of the reflected property because
        // not every element exposes a writable string property of that name
        // (e.g. SVG elements expose `href` as a read-only SVGAnimatedString,
        // and assigning to it throws in strict mode).
        const absolutize = (node, attr) => {
            const raw = node.getAttribute(attr);
            if (raw === null || !RELATIVE.test(raw)) {
                return;
            }
            let absolute;
            try {
                absolute = new URL(raw, base).href;
            } catch (err) {
                if (err instanceof TypeError) {
                    // unparsable value: best effort, keep it as written
                    return;
                }
                throw err;
            }
            node.setAttribute(attr, absolute);
        };

        // NOTE: bare relative paths such as "img/a.png" are not matched by
        // these selectors and therefore stay relative.
        const selectors = [
            '[src^="/"]:not([src^="//"])',
            '[src^="."]',
            '[href^="/"]:not([href^="//"])',
            '[href^="."]'
        ].join(',');
        Array.from(sham.querySelectorAll(selectors)).forEach(node => {
            // an element may carry both attributes; handle each one
            absolutize(node, 'src');
            absolutize(node, 'href');
        });

        // A Blob built from a string is always encoded as UTF-8, so a page
        // that declares another encoding would be mislabelled once saved.
        // Point the in-document declaration at UTF-8 in that case.
        if (sham.contentType === 'text/html' &&
                !/^utf-?8$/i.test(sham.characterSet)) {
            let declared = false;
            Array.from(sham.querySelectorAll('meta')).forEach(meta => {
                if (meta.hasAttribute('charset')) {
                    meta.setAttribute('charset', 'UTF-8');
                    declared = true;
                    return;
                }
                const equiv = meta.getAttribute('http-equiv') || '';
                const content = meta.getAttribute('content') || '';
                if (/^content-type$/i.test(equiv) && CHARSET_PARAM.test(content)) {
                    meta.setAttribute('content',
                        content.replace(CHARSET_PARAM, 'charset=UTF-8'));
                    declared = true;
                }
            });
            if (!declared && sham.head) {
                const meta = sham.createElement('meta');
                meta.setAttribute('charset', 'UTF-8');
                sham.head.insertBefore(meta, sham.head.firstChild);
            }
        }

        // dump <!DOCTYPE> et al. (every node preceding <html>) and the <html>
        // tree, old new thing; c.f.
        // https://developer.mozilla.org/en-US/docs/XMLSerializer
        // http://caniuse.com/#feat=xml-serializer
        // XMLSerializer is only used for the leading nodes; the element tree
        // itself goes through outerHTML.
        const serializer = new XMLSerializer();
        const preamble = [];
        for (let node = sham.documentElement.previousSibling;
                node;
                node = node.previousSibling) {
            preamble.unshift(serializer.serializeToString(node));
        }
        const snapshot = preamble.join('\n') + sham.documentElement.outerHTML;

        const snapURL = window.URL.createObjectURL(new Blob([snapshot], {
            // string parts of a Blob are UTF-8 encoded, label it as such
            type: sham.contentType + ';charset=UTF-8'
        }));

        const pool = document.createElement('a');
        pool.download = filename;
        pool.href = snapURL;
        pool.style.display = 'none';

        // The anchor is attached while clicking because older Firefox
        // releases ignore click() on a detached anchor. This happens after
        // the clone was taken, so the snapshot is unaffected.
        const host = document.body || document.documentElement;
        host.appendChild(pool);
        try {
            pool.click();
        } finally {
            host.removeChild(pool);
            // revoking synchronously can race with the start of the download
            window.setTimeout(() => {
                window.URL.revokeObjectURL(snapURL);
            }, REVOKE_DELAY_MS);
        }
    })();
    
    // NOTICE
    // XMLSerializer().serializeToString() sometimes over-escapes text nodes, e.g.
    // <script> 1 && 1; 1 < 1;</script> -> <script> 1 &amp;&amp; 1; 1 &lt; 1;</script>
    
    
  • Permalink
    このページへの個別リンクです。
    RAW
    書かれたコードへの直接のリンクです。
    Packed
    文字列が圧縮された書かれたコードへのリンクです。
    Userscript
    Greasemonkey 等で利用する場合の .user.js へのリンクです。
    Loader
    @require やソースコードが長い場合に多段ロードする Loader コミのコードへのリンクです。
    Metadata
    コード中にコメントで @xxx と書かれたメタデータの JSON です。

History

  1. 2015/12/27 11:45:28 - 2015-12-27
  2. 2015/12/27 11:19:11 - 2015-12-27
  3. 2015/12/27 11:16:01 - 2015-12-27