elisp-vcs/emacs-w3m/w3m-filter.el
2010-08-17 08:59:01 +02:00

368 lines
12 KiB
EmacsLisp
Raw Permalink Blame History

;;; w3m-filter.el --- filtering utility of advertisements on WEB sites -*- coding: euc-japan -*-
;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
;; TSUCHIYA Masatoshi <tsuchiya@namazu.org>
;; Authors: TSUCHIYA Masatoshi <tsuchiya@namazu.org>
;; Keywords: w3m, WWW, hypermedia
;; This file is a part of emacs-w3m.
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING. If not, write to
;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
;;; Commentary:
;; w3m-filter.el is the add-on utility to filter advertisements on WEB
;; sites.
;;; Code:
(provide 'w3m-filter)
(eval-when-compile
(require 'cl))
(require 'w3m)
(defcustom w3m-filter-rules
`(("\\`http://www\\.geocities\\.co\\.jp/"
w3m-filter-delete-regions
"<DIV ALIGN=CENTER>\n<!--*/GeoGuide/*-->" "<!--*/GeoGuide/*-->\n</DIV>")
("\\`http://[a-z]+\\.hp\\.infoseek\\.co\\.jp/"
w3m-filter-delete-regions
"<!-- start AD -->" "<!-- end AD -->")
("\\`http://linux\\.ascii24\\.com/linux/"
w3m-filter-delete-regions
"<!-- DAC CHANNEL AD START -->" "<!-- DAC CHANNEL AD END -->")
("\\`http://\\(www\\|images\\|news\\|maps\\|groups\\)\\.google\\."
w3m-filter-google)
("\\`https?://\\(?:www\\.\\)?amazon\\.\
\\(?:com\\|co\\.\\(?:jp\\|uk\\)\\|fr\\|de\\)/"
w3m-filter-amazon)
("\\`https?://mixi\\.jp" w3m-filter-mixi)
("\\`http://eow\\.alc\\.co\\.jp/[^/]+/UTF-8" w3m-filter-alc)
("\\`http://www\\.asahi\\.com/" w3m-filter-asahi-shimbun)
("\\`http://imepita\\.jp/[0-9]+/[0-9]+" w3m-filter-imepita)
("\\`http://allatanys\\.jp/" w3m-filter-allatanys)
("\\`http://.*\\.wikipedia\\.org/" w3m-filter-wikipedia)
("" w3m-filter-iframe))
"Rules to filter advertisements on WEB sites."
:group 'w3m
:type '(repeat
(cons :format "%v" :indent 4
(regexp :format "Regexp: %v\n" :size 0)
(choice
:tag "Filtering Rule"
(list :tag "Delete regions surrounded with these patterns"
(function-item :format "" w3m-filter-delete-region)
(regexp :tag "Start")
(regexp :tag "End"))
(list :tag "Filter with a user defined function"
function
(repeat :tag "Arguments" sexp))))))
(defcustom w3m-filter-google-use-utf8
(or (featurep 'un-define) (fboundp 'utf-translate-cjk-mode)
(and (not (equal "Japanese" w3m-language))
(w3m-find-coding-system 'utf-8)))
"*Use the converting rule to UTF-8 on the site of Google."
:group 'w3m
:type 'boolean)
(defcustom w3m-filter-google-use-ruled-line t
"*Use the ruled line on the site of Google."
:group 'w3m
:type 'boolean)
(defcustom w3m-filter-google-separator "<hr>"
"Field separator for Google's search results ."
:group 'w3m
:type 'string)
(defcustom w3m-filter-amazon-regxp
(concat
"\\`\\(https?://\\(?:www\\.\\)?amazon\\."
"\\(?:com\\|co\\.\\(?:jp\\|uk\\)\\|fr\\|de\\)"
;; "Joyo.com"
"\\)/"
"\\(?:"
"\\(?:exec/obidos\\|o\\)/ASIN"
"\\|"
"gp/product"
"\\|"
"\\(?:[^/]+/\\)?dp"
"\\)"
"/\\([0-9]+\\)")
"*Regexp to extract ASIN number for Amazon."
:group 'w3m
:type '(string :size 0))
(defcustom w3m-filter-amazon-short-url-bottom nil
"*Amazon short URLs insert bottom position."
:group 'w3m
:type 'boolean)
;;;###autoload
(defun w3m-filter (url)
"Apply filtering rule of URL against a content in this buffer."
(save-match-data
(dolist (elem w3m-filter-rules)
(when (string-match (car elem) url)
(apply (cadr elem) url (cddr elem))))))
(defun w3m-filter-delete-regions (url start end)
"Delete regions surrounded with a START pattern and an END pattern."
(goto-char (point-min))
(let (p (i 0))
(while (and (search-forward start nil t)
(setq p (match-beginning 0))
(search-forward end nil t))
(delete-region p (match-end 0))
(incf i))
(> i 0)))
(defun w3m-filter-replace-regexp (url regexp to-string)
"Replace all occurrences of REGEXP with TO-STRING."
(goto-char (point-min))
(while (re-search-forward regexp nil t)
(replace-match to-string nil nil)))
;; Filter functions:
(defun w3m-filter-asahi-shimbun (url)
"Convert entity reference of UCS."
(when w3m-use-mule-ucs
(goto-char (point-min))
(let ((case-fold-search t)
end ucs)
(while (re-search-forward "alt=\"\\([^\"]+\\)" nil t)
(goto-char (match-beginning 1))
(setq end (set-marker (make-marker) (match-end 1)))
(while (re-search-forward "&#\\([0-9]+\\);" (max end (point)) t)
(setq ucs (string-to-number (match-string 1)))
(delete-region (match-beginning 0) (match-end 0))
(insert-char (w3m-ucs-to-char ucs) 1))))))
(defun w3m-filter-google (url)
"Insert separator within items."
(goto-char (point-min))
(let ((endm (make-marker))
(case-fold-search t)
pos beg end)
(when (and w3m-filter-google-use-utf8
(re-search-forward "\
<a class=. href=\"http://\\(www\\|images\\|news\\|maps\\|groups\\)\\.google\\."
nil t)
(setq pos (match-beginning 0))
(search-backward "<table" nil t)
(setq beg (match-beginning 0))
(search-forward "</table" nil t)
(set-marker endm (match-end 0))
(< pos (marker-position endm)))
(goto-char beg)
(while (re-search-forward "[?&][io]e=\\([^&]+\\)&" endm t)
(replace-match "UTF-8" nil nil nil 1))
(setq end (marker-position endm)))
(when (string-match "\\`http://www\\.google\\.[^/]+/search\\?" url)
(goto-char (point-max))
(when (and w3m-filter-google-use-ruled-line
(search-backward "<div class=" end t)
(search-forward "</div>" nil t))
(insert w3m-filter-google-separator))
(if w3m-filter-google-use-ruled-line
(while (search-backward "<div class=" end t)
(insert w3m-filter-google-separator))
(while (search-backward "<div class=" end t)
(insert "<p>"))))))
(defun w3m-filter-amazon (url)
"Insert Amazon short URIs."
(when (string-match w3m-filter-amazon-regxp url)
(let* ((base (match-string 1 url))
(asin (match-string 2 url))
(shorturls `(,(concat base "/dp/" asin "/")
,(concat base "/o/ASIN/" asin "/")
,(concat base "/gp/product/" asin "/")))
(case-fold-search t)
shorturl)
(goto-char (point-min))
(setq url (file-name-as-directory url))
(when (or (and (not w3m-filter-amazon-short-url-bottom)
(search-forward "<body" nil t)
(search-forward ">" nil t))
(and w3m-filter-amazon-short-url-bottom
(search-forward "</body>" nil t)
(goto-char (match-beginning 0))))
(insert "\n")
(while (setq shorturl (car shorturls))
(setq shorturls (cdr shorturls))
(unless (string= url shorturl)
(insert (format "Amazon Short URL: <a href=\"%s\">%s</a><br>\n"
shorturl shorturl))))
(insert "\n")))))
(defun w3m-filter-mixi (url)
"Direct jump to the external diary."
(goto-char (point-min))
(let (newurl)
(while (re-search-forward "<a href=\"?view_diary\\.pl\\?url=\\([^>]+\\)>"
nil t)
(setq newurl (match-string 1))
(when newurl
(delete-region (match-beginning 0) (match-end 0))
(when (string-match "&owner_id=[0-9]+\"?\\'" newurl)
(setq newurl (substring newurl 0 (match-beginning 0))))
(insert (format "<a href=\"%s\">"
(w3m-url-readable-string newurl)))))))
(defun w3m-filter-alc (url)
(let ((baseurl "http://eow.alc.co.jp/%s/UTF-8/")
curl cword beg tmp1)
(when (string-match "\\`http://eow\\.alc\\.co\\.jp/\\([^/]+\\)/UTF-8/" url)
(setq curl (match-string 0 url))
(setq cword (match-string 1 url))
(setq cword (car (split-string (w3m-url-decode-string cword 'utf-8)
" ")))
(goto-char (point-min))
(while (search-forward "<EFBFBD>ǡ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ž<EFBFBD>ܤ϶ؤ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƥ<EFBFBD><EFBFBD>ޤ<EFBFBD>" nil t)
(delete-region (line-beginning-position) (line-end-position))
(insert "<br>"))
(goto-char (point-min))
(when (search-forward "<body" nil t)
(forward-line 1)
(insert "<h1><3E>Ѽ<EFBFBD>ϯ on the WEB<h1>\n")
(setq beg (point))
(when (search-forward "<!-- <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʸ<EFBFBD><CAB8><EFBFBD><EFBFBD> -->" nil t)
(forward-line 1)
(delete-region beg (point)))
(when (search-forward "<!-- <20><><EFBFBD><EFBFBD>ɥ<EFBFBD><C9A5><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD> -->" nil t)
(forward-line 1)
(setq beg (point))
(when (search-forward "</body>" nil t)
(delete-region beg (match-beginning 0))))
(insert "<br><3E><><EFBFBD>ǡ<EFBFBD><C7A1><EFBFBD><EFBFBD><EFBFBD>ž<EFBFBD>ܤ϶ؤ<CFB6><D8A4><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƥ<EFBFBD><C6A4>ޤ<EFBFBD><DEA4><EFBFBD>")
;; next/previous page
(goto-char (point-min))
(while (re-search-forward
"<a href='javascript:goPage(\"\\([0-9]+\\)\")'>"
nil t)
(setq tmp1 (match-string 1))
(delete-region (match-beginning 0) (match-end 0))
(insert (format "<a href=\"%s?pg=%s\">" curl tmp1)))
;; wordlink
(goto-char (point-min))
(while (re-search-forward
"<span class=\"wordlink\">\\([^<]+\\)</span>"
nil t)
(setq tmp1 (match-string 1))
(delete-region (match-beginning 0) (match-end 0))
(insert (format "<a href=\"%s\">%s</a>" (format baseurl tmp1) tmp1)))
;; goGradable/goFairWord
(goto-char (point-min))
(while (re-search-forward
"<a href='javascript:\\(goGradable\\|goFairWord\\)(\"\\([^\"]+\\)\")'>"
nil t)
(setq tmp1 (match-string 2))
(delete-region (match-beginning 0) (match-end 0))
(insert (format "<a href=\"%s\">" (format baseurl tmp1))))
;; remove spacer
(goto-char (point-min))
(while (search-forward "img/spacer.gif" nil t)
(delete-region (line-beginning-position) (line-end-position)))
(goto-char (point-min))
;; remove <20><EFBFBD>ɥ<EFBFBD><C9A5><EFBFBD><EFBFBD><EFBFBD>
(when (search-forward "alt=\"<EFBFBD><EFBFBD>ɥ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\"" nil t)
(delete-region (line-beginning-position) (line-end-position)))
;; <20><>ʸ<EFBFBD><CAB8>ɽ<EFBFBD><C9BD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̵<EFBFBD><CCB5>
(goto-char (point-min))
(while (re-search-forward
(concat "<br */> *<2A><><strong>"
"<a href='javascript:goFullText(\"[^\"]+\", \"[^\"]+\")'>"
"<EFBFBD><EFBFBD>ʸ<EFBFBD><EFBFBD>ɽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD></a>")
nil t)
(delete-region (match-beginning 0) (match-end 0)))
;; Java Document write... ;_;
;; (while (re-search-forward
;; "<a href='javascript:goFullText(\"\\([^\"]+\\)\", \"\\([^\"]+\\)\")'>"
;; nil t)
;; (setq tmp1 (match-string 1))
;; (setq tmp2 (match-string 2))
;; (delete-region (match-beginning 0) (match-end 0))
;; ;; &dk=JE, &dk=EJ
;; (insert (format "<a href=\"%s?ref=ex&exp=%s&dn=%s&dk=%s\">"
;; curl tmp1 tmp2
;; (if (string-match "\\Cj" cword) "JE" "EJ"))))
))))
(defun w3m-filter-imepita (url)
"JavaScript emulation."
(goto-char (point-min))
(let (tmp)
(when (re-search-forward
(concat "<script><!--\ndocument.write('\\([^\n]*\\)');\r\n//--></script>\n"
"<noscript>.*</noscript>")
nil t)
(setq tmp (match-string 1))
(delete-region (match-beginning 0) (match-end 0))
(insert tmp))))
(defun w3m-filter-iframe (url)
(goto-char (point-min))
(while (re-search-forward "<iframe [^>]*src=\"\\([^\"]*\\)\"[^>]*>" nil t)
(insert (concat "[iframe:<a href=\"" (match-string 1) "\">" (match-string 1) "</a>]"))))
(defun w3m-filter-allatanys (url)
"JavaScript emulation."
(goto-char (point-min))
(let (aturl atexpurl)
(if (re-search-forward
(concat "<body[ \t\r\f\n]+onload=\"window\\.top\\.location\\.replace('"
w3m-html-string-regexp
"');\">")
nil t)
(progn
(setq aturl (match-string 1))
(setq atexpurl (w3m-expand-url aturl url))
(delete-region (match-beginning 0) (match-end 0))
(insert "<body>\n"
"<hr>"
"Body has a <b>url=window.top.location.replace()</b><br><br>\n"
(format "Goto: <a href=%s>%s</a>\n" aturl atexpurl)
"<hr>")
(goto-char (point-min))
(insert (format "<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0;URL=%s\">\n"
aturl)))
(while (re-search-forward (concat "<a[ \t\r\l\n]+href=\"javascript:[^(]+('"
"\\([^']+\\)')\">")
nil t)
(setq aturl (match-string 1))
(delete-region (match-beginning 0) (match-end 0))
(insert (format "<a href=\"%s\">" aturl))))))
(defun w3m-filter-wikipedia (url)
"Make anchor reference to work."
(goto-char (point-min))
(let (matched-text refid)
(while (re-search-forward
"<\\(?:sup\\|cite\\) id=\"\\([^\"]*\\)\"" nil t)
(setq matched-text (match-string 0)
refid (match-string 1))
(delete-region (match-beginning 0) (match-end 0))
(insert (format "<a name=\"%s\"></a>%s" refid matched-text)))))
;;; w3m-filter.el ends here