Author: chestnut
Date: Mon Jan 30 11:12:39 2006
New Revision: 373570
URL: http://svn.apache.org/viewcvs?rev=373570&view=rev
Log:
moved external linkcheck functionality to a module
Added:
lenya/trunk/src/modules/linkcheck/
lenya/trunk/src/modules/linkcheck/config/
lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/
lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/usecase-getLinks.xconf
(with props)
lenya/trunk/src/modules/linkcheck/java/
lenya/trunk/src/modules/linkcheck/java/src/
lenya/trunk/src/modules/linkcheck/java/src/org/
lenya/trunk/src/modules/linkcheck/java/src/org/apache/
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java
(with props)
lenya/trunk/src/modules/linkcheck/resources/
lenya/trunk/src/modules/linkcheck/resources/linkreporter.js (with props)
lenya/trunk/src/modules/linkcheck/usecases/
lenya/trunk/src/modules/linkcheck/usecases/linkcheck/
lenya/trunk/src/modules/linkcheck/usecases/linkcheck/getLinks.jx
lenya/trunk/src/modules/linkcheck/usecases/linkcheck/usecase.xmap (with
props)
Removed:
lenya/trunk/src/java/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java
lenya/trunk/src/pubs/default/resources/shared/javascript/linkreporter.js
lenya/trunk/src/pubs/default/usecase-linkreport.xmap
Modified:
lenya/trunk/src/pubs/default/sitemap.xmap
lenya/trunk/src/webapp/lenya/usecases/usecase.xmap
Added:
lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/usecase-getLinks.xconf
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/usecase-getLinks.xconf?rev=373570&view=auto
==============================================================================
---
lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/usecase-getLinks.xconf
(added)
+++
lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/usecase-getLinks.xconf
Mon Jan 30 11:12:39 2006
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<!--
+ Copyright 1999-2004 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<xconf xpath="/cocoon/usecases"
unless="/cocoon/usecases/component-instance[@name = 'linkcheck.getLinks']">
+
+ <component-instance name="linkcheck.getLinks" logger="lenya.linkcheck"
class="org.apache.lenya.cms.usecase.DummyUsecase">
+ <view template="modules/linkcheck/usecases/linkcheck/getLinks.jx"/>
+ </component-instance>
+
+</xconf>
\ No newline at end of file
Propchange:
lenya/trunk/src/modules/linkcheck/config/cocoon-xconf/usecase-getLinks.xconf
------------------------------------------------------------------------------
svn:eol-style = native
Added:
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java?rev=373570&view=auto
==============================================================================
---
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java
(added)
+++
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java
Mon Jan 30 11:12:39 2006
@@ -0,0 +1,695 @@
+/*
+ * Copyright 1999-2005 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lenya.cms.cocoon.generation;
+
+import org.apache.cocoon.generation.ServiceableGenerator;
+import org.apache.avalon.excalibur.pool.Recyclable;
+import org.apache.avalon.framework.parameters.Parameters;
+import org.apache.avalon.framework.configuration.Configurable;
+import org.apache.avalon.framework.configuration.Configuration;
+import org.apache.avalon.framework.configuration.ConfigurationException;
+import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.environment.ObjectModelHelper;
+import org.apache.cocoon.environment.Request;
+import org.apache.cocoon.environment.SourceResolver;
+import org.apache.cocoon.Constants;
+import org.apache.commons.lang.StringUtils;
+import org.apache.excalibur.source.Source;
+import org.apache.lenya.cms.publication.DocumentIdentityMap;
+import org.apache.lenya.cms.repository.RepositoryUtil;
+import org.apache.lenya.cms.repository.Session;
+import org.apache.regexp.RE;
+import org.apache.regexp.RESyntaxException;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URLConnection;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.Map;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Generates a list of the links that are reachable from the src, together with the status of each link.
+ *
+ * <pre>
+ * <map:generator name="linkStatus"
src="org.apache.lenya.cms.cocoon.generation.LinkStatusGenerator"/>
+ *
+ * <map:generate type="linkStatus" src="/{pubid}/{area}/{doc-id}.html">
+ * <map:parameter name="depth" value="1"/>
+ * </map:generate>
+ * </pre>
+**/
+
+public class LinkStatusGenerator extends ServiceableGenerator
+ implements Recyclable, Configurable {
+
+ /** The URI of the namespace of this generator. */
+ protected static final String URI =
+ "http://apache.org/cocoon/linkstatus/2.0";
+
+ /** The namespace prefix for this namespace. */
+ protected static final String PREFIX = "linkstatus";
+
+ /* Node and attribute names */
+ protected static final String TOP_NODE_NAME = "linkstatus";
+ protected static final String LINK_NODE_NAME = "link";
+
+ protected static final String HREF_ATTR_NAME = "href";
+ protected static final String REFERRER_ATTR_NAME = "referrer";
+ protected static final String CONTENT_ATTR_NAME = "content";
+ protected static final String STATUS_ATTR_NAME = "status";
+ protected static final String MESSAGE_ATTR_NAME = "message";
+
+ protected AttributesImpl attributes;
+
+ /**
+ * Config element name specifying the expected link content-type.
+ * <p>
+ * Its value is <code>link-content-type</code>.
+ * </p>
+ *
+ * @since
+ */
+ public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
+
+    /**
+     * Default value of <code>link-content-type</code> configuration value.
+     * <p>
+     * Its value is <code>application/x-cocoon-links</code>.
+     * </p>
+     * <p>
+     * Declared <code>static</code> so that it is a true class constant rather
+     * than a per-instance field.
+     * </p>
+     *
+     * @since
+     */
+    public final static String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
+
+ /**
+ * Config element name specifying the query-string appended to a URL when
+ * requesting its links.
+ * <p>
+ * Its value is <code>link-view-query</code>.
+ * </p>
+ *
+ * @since
+ */
+ public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";
+ /**
+ * Default value of <code>link-view-query</code> configuration value.
+ * <p>
+ * Its value is <code>cocoon-view=links</code> (without a leading <code>?</code>).
+ * </p>
+ *
+ * @since
+ */
+ public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
+
+ /**
+ * Config element name specifying excluding regular expression pattern.
+ * <p>
+ * Its value is <code>exclude</code>.
+ * </p>
+ *
+ * @since
+ */
+ public final static String EXCLUDE_CONFIG = "exclude";
+
+ /**
+ * Config element name specifying including regular expression pattern.
+ * <p>
+ * Its value is <code>include</code>.
+ * </p>
+ *
+ * @since
+ */
+ public final static String INCLUDE_CONFIG = "include";
+
+ /**
+ * Config element name specifying http header value for user-Agent.
+ * <p>
+ * Its value is <code>user-agent</code>.
+ * </p>
+ *
+ * @since
+ */
+ public final static String USER_AGENT_CONFIG = "user-agent";
+ /**
+ * Default value of <code>user-agent</code> configuration value.
+ *
+ * @see org.apache.cocoon.Constants#COMPLETE_NAME
+ * @since
+ */
+ public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
+
+ /**
+ * Config element name specifying http header value for accept.
+ * <p>
+ * Its value is <code>accept</code>.
+ * </p>
+ *
+ * @since
+ */
+ public final static String ACCEPT_CONFIG = "accept";
+ /**
+ * Default value of <code>accept</code> configuration value.
+ * <p>
+ * Its value is <code>* / *</code>
+ * </p>
+ *
+ * @since
+ */
+ public final static String ACCEPT_DEFAULT = "*/*";
+
+ private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
+ private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
+ private HashSet excludeCrawlingURL;
+ private HashSet includeCrawlingURL;
+
+ private HashSet crawled;
+ private HashSet linksToProcess;
+
+ /** The depth parameter determines how deep the LinkStatusGenerator should delve. */
+ protected int depth = 1;
+
+ protected Source inputSource;
+ String src;
+ private DocumentIdentityMap identityMap;
+
+ /**
+ * Stores links to process and the referrer links
+ */
+ private static class Link {
+ private String uri;
+ private String referrer;
+ private int linkDepth;
+
+ public Link(String uri, String referrer, int linkDepth) {
+ this.uri = uri;
+ this.referrer = referrer;
+ this.linkDepth = linkDepth;
+ }
+
+ public String getURI() {
+ return uri;
+ }
+
+ public String getReferrer() {
+ return referrer;
+ }
+
+ public int getDepth() {
+ return linkDepth;
+ }
+
+ public boolean equals(Link l) {
+ return uri.equals(l.getURI());
+ }
+ }
+
+ /**
+ * Configure the crawler component.
+ * <p>
+ * The configuration can specify which URIs to include in, and which URIs to
+ * exclude from, crawling. You specify the patterns as regular expressions;
+ * one element value is split on commas and spaces, so it may carry several
+ * patterns.
+ * </p>
+ * <p>
+ * Moreover you can configure
+ * the required content-type of crawling request, and the
+ * query-string appended to each crawling request.
+ * </p>
+ * <pre><tt>
+ * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
+ * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
+ * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
+ * &lt;link-view-query&gt; cocoon-view=links &lt;/link-view-query&gt;
+ * &lt;user-agent&gt; Cocoon &lt;/user-agent&gt;
+ * &lt;accept&gt; text/xml &lt;/accept&gt;
+ * </tt></pre>
+ * <p>
+ * When no <code>exclude</code> element is present the default exclude
+ * patterns are installed instead. A pattern that is not a valid regular
+ * expression is logged as an error; the remaining patterns of the same
+ * element are then skipped, and configuration continues.
+ * </p>
+ * <p>
+ * NOTE(review): the <code>user-agent</code> and <code>accept</code> elements
+ * shown above are not read by this method; only <code>include</code>,
+ * <code>exclude</code>, <code>link-content-type</code> and
+ * <code>link-view-query</code> are evaluated.
+ * </p>
+ *
+ * @param configuration XML configuration of this avalon component.
+ * @exception ConfigurationException is thrown if the configuration is invalid.
+ * @since
+ */
+ public void configure(Configuration configuration)
+ throws ConfigurationException {
+
+ Configuration[] children;
+ children = configuration.getChildren(INCLUDE_CONFIG);
+ if (children.length > 0) {
+ includeCrawlingURL = new HashSet();
+ for (int i = 0; i < children.length; i++) {
+ String pattern = children[i].getValue();
+ try {
+ String params[] = StringUtils.split(pattern, ", ");
+ for (int index = 0; index < params.length; index++) {
+ String tokenized_pattern = params[index];
+ this.includeCrawlingURL.add(new RE(tokenized_pattern));
+ }
+ } catch (RESyntaxException rese) {
+ getLogger().error("Cannot create including
regular-expression for " +
+ pattern, rese);
+ }
+ }
+ }
+
+ children = configuration.getChildren(EXCLUDE_CONFIG);
+ if (children.length > 0) {
+ excludeCrawlingURL = new HashSet();
+ for (int i = 0; i < children.length; i++) {
+ String pattern = children[i].getValue();
+ try {
+ String params[] = StringUtils.split(pattern, ", ");
+ for (int index = 0; index < params.length; index++) {
+ String tokenized_pattern = params[index];
+ this.excludeCrawlingURL.add(new RE(tokenized_pattern));
+ }
+ } catch (RESyntaxException rese) {
+ getLogger().error("Cannot create excluding
regular-expression for " +
+ pattern, rese);
+ }
+ }
+ } else {
+ excludeCrawlingURL = new HashSet();
+ setDefaultExcludeFromCrawling();
+ }
+
+ Configuration child;
+ String value;
+ child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
+ if (child != null) {
+ value = child.getValue();
+ if (value != null && value.length() > 0) {
+ this.linkContentType = value.trim();
+ }
+ }
+ child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
+ if (child != null) {
+ value = child.getValue();
+ if (value != null && value.length() > 0) {
+ this.linkViewQuery = value.trim();
+ }
+ }
+ }
+
+ /**
+ * Set up the generator for one request.
+ * <p>
+ * Creates the <code>DocumentIdentityMap</code> from the session that
+ * <code>RepositoryUtil.getSession</code> returns for the current request,
+ * remembers <code>src</code> as the root of the crawl, reads the
+ * <code>depth</code> sitemap parameter (default <code>1</code>) and
+ * allocates the attributes object that is reused for every emitted element.
+ * </p>
+ *
+ * @param resolver source resolver, passed on to the super class
+ * @param objectModel object model the current request is taken from
+ * @param src URI at which crawling starts
+ * @param par sitemap parameters; only <code>depth</code> is read here
+ */
+ public void setup(SourceResolver resolver, Map objectModel, String src,
Parameters par)
+ throws ProcessingException, SAXException, IOException {
+
+ Request request = ObjectModelHelper.getRequest(objectModel);
+ Session session = RepositoryUtil.getSession(request, getLogger());
+ this.identityMap = new DocumentIdentityMap(session, this.manager,
getLogger());
+
+ super.setup(resolver, objectModel, src, par);
+ this.src = src;
+ this.depth = par.getParameterAsInteger("depth", 1);
+
+ /* Create a reusable attributes for creating nodes */
+ this.attributes = new AttributesImpl();
+ }
+
+    /**
+     * Generate XML data.
+     * <p>
+     * Starting with the <code>src</code> handed to <code>setup</code>, takes
+     * one pending link at a time, reports it through <code>processURL</code>
+     * and, as long as the link's depth is below <code>depth</code>, queues
+     * the links found behind it. All output is wrapped in a single
+     * <code>linkstatus</code> root element in the generator's namespace.
+     * </p>
+     *
+     * @throws SAXException
+     *             if an error occurs while outputting the document
+     * @throws ProcessingException
+     *             declared for the generator contract; this implementation
+     *             does not throw it directly
+     */
+    public void generate()
+    throws SAXException, ProcessingException {
+
+        crawled = new HashSet();
+        linksToProcess = new HashSet();
+
+        // this first node should be handled as a cocoon source
+        String root = this.src;
+        linksToProcess.add(new Link(root, "", 0));
+
+        if (getLogger().isDebugEnabled()) {
+            getLogger().debug("crawl URL " + root);
+        }
+
+        this.contentHandler.startDocument();
+        this.contentHandler.startPrefixMapping(PREFIX, URI);
+
+        attributes.clear();
+        super.contentHandler.startElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME, attributes);
+
+        while (!linksToProcess.isEmpty()) {
+            // fetch a URL and remove it from the to-do list; removing through
+            // the iterator does not depend on how Link implements equals/hashCode
+            Iterator pending = linksToProcess.iterator();
+            Link link = (Link) pending.next();
+            pending.remove();
+
+            String uri = link.getURI();
+            int referrerDepth = link.getDepth();
+            String new_url_link = processURL(uri, link.getReferrer(), referrerDepth);
+
+            // calc all links from this url
+            if (new_url_link != null && referrerDepth < this.depth) {
+                List url_links = getLinksFromConnection(new_url_link, uri, referrerDepth);
+                if (url_links != null) {
+                    // add links of this url to the to-do list
+                    linksToProcess.addAll(url_links);
+                }
+            }
+        }
+
+        super.contentHandler.endElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME);
+        this.contentHandler.endPrefixMapping(PREFIX);
+        this.contentHandler.endDocument();
+    }
+
+ /**
+ * Default exclude patterns.
+ * <p>
+ * Adds the built-in patterns to <code>excludeCrawlingURL</code>, which must
+ * already have been created by the caller. By default URLs matching the
+ * following patterns are excluded:
+ * </p>
+ * <ul>
+ * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
+ * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
+ * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
+ * <li>.*\\.js(\\?.*)?$ - exclude javascript </li>
+ * <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
+ * <li>.*\\?.* - exclude every URL that contains a '?'</li>
+ * <li>.*\\@.* - exclude every URL that contains a '@'</li>
+ * </ul>
+ * <p>
+ * A pattern that cannot be compiled is logged as an error and skipped.
+ * </p>
+ *
+ * @since
+ */
+ private void setDefaultExcludeFromCrawling() {
+ String[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
+ ".*\\.gif(\\?.*)?$",
+ ".*\\.png(\\?.*)?$",
+ ".*\\.jpe?g(\\?.*)?$",
+ ".*\\.js(\\?.*)?$",
+ ".*\\.css(\\?.*)?$",
+ ".*\\?.*",".*\\@.*"
+ };
+
+ for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
+ String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
+ try {
+ excludeCrawlingURL.add(new RE(pattern));
+ } catch (RESyntaxException rese) {
+ getLogger().error("Cannot create excluding regular-expression
for " +
+ pattern, rese);
+ }
+ }
+ }
+
+
+    /**
+     * Retrieve a list of links of a url
+     * <p>
+     * The link view is read line by line; every line is taken as one link.
+     * A link is dropped when it already occurred in this response, has been
+     * crawled, is already queued, is not matched by the include patterns or
+     * is matched by the exclude patterns. The resolved source is released
+     * before the method returns, so repeated calls do not accumulate
+     * unreleased sources.
+     * </p>
+     *
+     * @param url_link_string path (including the link-view query) for
+     *   requesting links; it is resolved through the <code>cocoon:/</code>
+     *   protocol, ie it is of the form <code>/foo/bar?cocoon-view=links</code>
+     * @param url_of_referrer url of which links are requested; recorded as
+     *   the referrer of every returned link
+     * @param referrerDepth crawl depth of the referrer; returned links get
+     *   <code>referrerDepth + 1</code>
+     * @return List of <code>Link</code> objects found behind url_link_string;
+     *   empty (never <code>null</code>) when nothing could be read
+     */
+    protected List getLinksFromConnection(String url_link_string, String url_of_referrer, int referrerDepth) {
+        List url_links = new ArrayList();
+        // URIs already accepted from this response. url_links holds Link
+        // objects, so looking up a plain String in it can never succeed.
+        HashSet acceptedURIs = new HashSet();
+        BufferedReader br = null;
+        try {
+            url_link_string = "cocoon:/" + url_link_string;
+
+            inputSource = super.resolver.resolveURI(url_link_string);
+            InputStream is = inputSource.getInputStream();
+            // NOTE(review): decodes with the platform default charset - confirm
+            // the encoding the links view is serialized in.
+            br = new BufferedReader(new InputStreamReader(is));
+
+            // content is supposed to be a list of links,
+            // relative to current URL
+            String line;
+            while ((line = br.readLine()) != null) {
+                // don't add the same link twice
+                boolean add_url = !acceptedURIs.contains(line);
+
+                // don't add the link if it has been crawled already
+                if (add_url) {
+                    add_url = !crawled.contains(line);
+                }
+
+                // don't add the link if it is already waiting to be processed
+                Link new_link = new Link(line, url_of_referrer, referrerDepth + 1);
+                if (add_url) {
+                    add_url = !linksToProcess.contains(new_link);
+                }
+
+                // don't add if it is not matched by existing include definition
+                if (add_url) {
+                    add_url = isIncludedURL(line);
+                }
+
+                // don't add if it is matched by existing exclude definition
+                if (add_url) {
+                    add_url = !isExcludedURL(line);
+                }
+
+                if (add_url) {
+                    if (getLogger().isDebugEnabled()) {
+                        getLogger().debug("Add URL: " + line);
+                    }
+                    acceptedURIs.add(line);
+                    url_links.add(new_link);
+                }
+            }
+            // now we have a list of URL which should be examined
+
+        } catch (IOException ioe) {
+            getLogger().warn("Problems get links of " + url_link_string, ioe);
+        } finally {
+            // explicitly close the stream
+            if (br != null) {
+                try {
+                    br.close();
+                } catch (IOException ignored) {
+                    // nothing sensible can be done if closing fails
+                }
+            }
+            // release the source of this request right away; otherwise every
+            // call would overwrite inputSource and only the last source would
+            // ever be released
+            if (inputSource != null) {
+                super.resolver.release(inputSource);
+                inputSource = null;
+            }
+        }
+        return url_links;
+    }
+
+    /**
+     * Generate xml attributes of a url, calculate url for retrieving links
+     * <p>
+     * External links (<code>http://</code>) are probed with a HEAD request
+     * and written to the output as a <code>link</code> element. Internal
+     * links are only checked against the document identity map and are not
+     * written to the output.
+     * </p>
+     *
+     * @param uri the link to process
+     * @param referrer the URI of the page the link was found on; relative
+     *   internal links are resolved against it
+     * @param referrerDepth crawl depth of the link (not evaluated here)
+     * @return String url for retrieving links, or null if the uri has been
+     *   crawled already, is not an existing internal document, is an
+     *   excluded-url, or is not an included-url.
+     * @throws SAXException if the link element cannot be written
+     */
+    protected String processURL(String uri, String referrer, int referrerDepth) throws SAXException {
+
+        if (getLogger().isDebugEnabled()) {
+            getLogger().debug("getLinks URL " + uri);
+        }
+
+        // don't try to investigate a url which has been crawled already
+        if (crawled.contains(uri)) {
+            return null;
+        }
+
+        //TODO: need to respect robots.txt
+
+        // mark it as crawled
+        crawled.add(uri);
+
+        attributes.clear();
+        attributes.addAttribute("", HREF_ATTR_NAME, HREF_ATTR_NAME, "CDATA", uri);
+        attributes.addAttribute("", REFERRER_ATTR_NAME, REFERRER_ATTR_NAME, "CDATA", referrer);
+
+        // Output url, referrer, content-type, status, message for traversable url's
+        HttpURLConnection h = null;
+        String newURL = null;
+        try {
+            String content_type = "text/html";
+            String responseMessage = "not found";
+            int responseCode = 404;
+            if (uri.startsWith("http://")) {
+                URL url = new URL(uri);
+                URLConnection links_url_connection = url.openConnection();
+                h = (HttpURLConnection) links_url_connection;
+                h.setRequestMethod("HEAD"); //lets be kind to external sites
+                content_type = links_url_connection.getContentType();
+                responseMessage = h.getResponseMessage();
+                responseCode = h.getResponseCode();
+            } else {
+                String tempURI = uri;
+                if (!(uri.startsWith("/"))) {
+                    // relative link: resolve against the referrer's directory
+                    String contextURI = referrer.substring(0, referrer.lastIndexOf("/") + 1);
+                    tempURI = contextURI + uri;
+                }
+
+                //see if the document exists
+                if (this.identityMap.isDocument(tempURI)) {
+                    content_type = "text/html";
+                    responseMessage = "ok";
+                    responseCode = 200;
+                    newURL = tempURI;
+                }
+                //TODO: otherwise see if the resource exists
+            }
+
+            attributes.addAttribute("", CONTENT_ATTR_NAME, CONTENT_ATTR_NAME, "CDATA", content_type);
+            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME, "CDATA", responseMessage);
+            attributes.addAttribute("", STATUS_ATTR_NAME, STATUS_ATTR_NAME, "CDATA",
+                    String.valueOf(responseCode));
+        } catch (final Exception e) {
+            // Covers I/O problems as well as e.g. a malformed URL. Exceptions
+            // may carry no message; never store null as an attribute value.
+            String message = e.getMessage();
+            if (message == null) {
+                message = e.toString();
+            }
+            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME, "CDATA", message);
+        } finally {
+            if (h != null) {
+                h.disconnect();
+            }
+        }
+
+        // don't try to get links of a url which is excluded from crawling
+        // try to get links of a url which is included for crawling
+        if (!isExcludedURL(uri) && isIncludedURL(uri)) {
+            // add prefix and query to get data from the linkserializer.
+            if (newURL != null) {
+                // tolerate a configured query with or without the leading '?'
+                String query = linkViewQuery;
+                if (query.startsWith("?")) {
+                    query = query.substring(1);
+                }
+                // an existing query is replaced by the link-view query; the
+                // '?' separator has to be put back after cutting it off
+                int queryStart = newURL.indexOf("?");
+                if (queryStart > -1) {
+                    newURL = newURL.substring(0, queryStart);
+                }
+                newURL = newURL + "?" + query;
+            }
+        } else {
+            // honour the documented contract: no link url for urls that are
+            // excluded or not included
+            newURL = null;
+        }
+
+        //linkrewriter transformer takes care of internal links
+        if (uri.startsWith("http://")) {
+            super.contentHandler.startElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME, attributes);
+            super.contentHandler.endElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME);
+        }
+
+        return newURL;
+    }
+
+ /**
+ * Check whether a URL is matched by one of the exclude patterns.
+ * <p>
+ * The decision is also written to the debug log.
+ * </p>
+ *
+ * @param url the URL to test
+ * @return <code>true</code> if at least one exclude pattern matches the
+ * URL; <code>false</code> if none matches or no exclude patterns are set
+ * @since
+ */
+ private boolean isExcludedURL(String url) {
+ // without exclude patterns no URL is excluded from crawling
+ if (excludeCrawlingURL == null) {
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("exclude no URL " + url);
+ }
+ return false;
+ }
+
+ final String s = url;
+ Iterator i = excludeCrawlingURL.iterator();
+ while (i.hasNext()) {
+ RE pattern = (RE) i.next();
+ if (pattern.match(s)) {
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("exclude URL " + url);
+ }
+ return true;
+ }
+ }
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("exclude not URL " + url);
+ }
+ return false;
+ }
+
+
+ /**
+ * Check whether a URL is matched by one of the include patterns.
+ * <p>
+ * The decision is also written to the debug log.
+ * </p>
+ *
+ * @param url the URL to test
+ * @return <code>true</code> if no include patterns are set or at least one
+ * of them matches the URL; <code>false</code> otherwise
+ * @since
+ */
+ private boolean isIncludedURL(String url) {
+ // by default include URL for crawling
+ if (includeCrawlingURL == null) {
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("include all URL " + url);
+ }
+ return true;
+ }
+
+ final String s = url;
+ Iterator i = includeCrawlingURL.iterator();
+ while (i.hasNext()) {
+ RE pattern = (RE) i.next();
+ if (pattern.match(s)) {
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("include URL " + url);
+ }
+ return true;
+ }
+ }
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("include not URL " + url);
+ }
+ return false;
+ }
+
+ /**
+ * Release per-request state.
+ * <p>
+ * Releases the source still held in <code>inputSource</code> (if any),
+ * then clears the resolver, manager and attributes references before
+ * delegating to <code>super.recycle()</code>.
+ * </p>
+ * <p>
+ * NOTE(review): <code>manager</code> is set to <code>null</code> here, but
+ * <code>setup</code> reads <code>this.manager</code> and nothing in this
+ * class assigns it again - confirm that the container services the
+ * component again before it is reused, otherwise a recycled instance is
+ * set up with a null manager. Also confirm that the resolver handed to
+ * <code>setup</code> is meant to be released through the manager.
+ * </p>
+ */
+ public void recycle() {
+ if (null != this.inputSource) {
+ super.resolver.release(this.inputSource);
+ this.inputSource = null;
+ }
+ this.manager.release(super.resolver);
+ super.resolver = null;
+ this.manager = null;
+ this.attributes = null;
+ super.recycle();
+ }
Propchange:
lenya/trunk/src/modules/linkcheck/java/src/org/apache/lenya/cms/cocoon/generation/LinkStatusGenerator.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lenya/trunk/src/modules/linkcheck/resources/linkreporter.js
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/modules/linkcheck/resources/linkreporter.js?rev=373570&view=auto
==============================================================================
--- lenya/trunk/src/modules/linkcheck/resources/linkreporter.js (added)
+++ lenya/trunk/src/modules/linkcheck/resources/linkreporter.js Mon Jan 30
11:12:39 2006
@@ -0,0 +1,98 @@
+/*
+* Copyright 1999-2004 The Apache Software Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+var req;
+
+function processReqChange() {
+ // only if req shows "loaded"
+ if (req.readyState == 4) {
+ // only if "OK"
+ if (req.status == 200) {
+ // ...processing statements go here...
+ //parse link report for broken links
+ var rptLinks = req.responseXML.getElementsByTagName("link");
+ var brokenLinks = new Array(rptLinks.length);
+ brokenCount=0;
+ for(var i = 0; i < rptLinks.length; i++) { // Loop through the
returned links
+ if (rptLinks[i].hasAttribute("status") &&
rptLinks[i].getAttribute("status") == "404") {
+ brokenLinks[brokenCount++] =
rptLinks[i].getAttribute("href");
+ }
+ }
+ if (brokenCount > 0) {
+ //get link elements from dom
+ var links =
document.getElementById("page").getElementsByTagName("a");
+ for (var i = 0; i < links.length; i++) { // Loop through the
links in the doc
+ //for each link, check to see if it is in broken list
+ for (var j = 0; j < brokenLinks.length; j++) {
+ if (brokenLinks[j] == links[i]) {
+ //if it is, give it class attribute with value
"brokenlink"
+ links[i].setAttribute("class", "brokenlink")
+ }
+ }
+ }
+ }
+ } else {
+ alert("There was a problem retrieving the XML data:\n" +
+ req.statusText);
+ }
+ }
+}
+
+// Starts an asynchronous GET request for url and registers processReqChange
+// as its ready-state handler. The request object is stored in the global
+// "req"; it stays false (and nothing is sent) when neither XMLHttpRequest
+// nor one of the ActiveX variants can be created.
+function loadXMLDoc(url) {
+ req = false;
+ // branch for native XMLHttpRequest object
+ if(window.XMLHttpRequest) {
+ try {
+ req = new XMLHttpRequest();
+ } catch(e) {
+ req = false;
+ }
+ // branch for IE/Windows ActiveX version
+ } else if(window.ActiveXObject) {
+ try {
+ req = new ActiveXObject("Msxml2.XMLHTTP");
+ } catch(e) {
+ try {
+ req = new ActiveXObject("Microsoft.XMLHTTP");
+ } catch(e) {
+ req = false;
+ }
+ }
+ }
+ if(req) {
+ req.onreadystatechange = processReqChange;
+ req.open("GET", url, true);
+ req.send("");
+ }
+}
+
+// Requests the link report by calling the linkcheck.getLinks usecase with
+// asXML=true, relative to the current page URL. Assigned without "var", so
+// reportlinks is a global.
+reportlinks = function() {
+ //get link report
+ loadXMLDoc("?lenya.usecase=linkcheck.getLinks&asXML=true");
+}
+
+// assign reportlinks function to onload
+
+// Registers newFunction as a window.onload handler without discarding a
+// handler that is already installed: an existing handler is called first,
+// then newFunction.
+function addOnLoad(newFunction) {
+ var oldOnload = window.onload;
+ if (typeof window.onload != 'function') {
+ window.onload = newFunction;
+ } else {
+ window.onload = function() { oldOnload(); newFunction(); }
+ }
+}
+
+addOnLoad(reportlinks);
\ No newline at end of file
Propchange: lenya/trunk/src/modules/linkcheck/resources/linkreporter.js
------------------------------------------------------------------------------
svn:eol-style = native
Added: lenya/trunk/src/modules/linkcheck/usecases/linkcheck/getLinks.jx
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/modules/linkcheck/usecases/linkcheck/getLinks.jx?rev=373570&view=auto
==============================================================================
--- lenya/trunk/src/modules/linkcheck/usecases/linkcheck/getLinks.jx (added)
+++ lenya/trunk/src/modules/linkcheck/usecases/linkcheck/getLinks.jx Mon Jan 30
11:12:39 2006
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 1999-2005 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<cinclude:includexml xmlns:cinclude="http://apache.org/cocoon/include/1.0">
+
<cinclude:src>cocoon://core/modules/linkcheck/linkcheck/getLinks.xml</cinclude:src>
+</cinclude:includexml>
\ No newline at end of file
Added: lenya/trunk/src/modules/linkcheck/usecases/linkcheck/usecase.xmap
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/modules/linkcheck/usecases/linkcheck/usecase.xmap?rev=373570&view=auto
==============================================================================
--- lenya/trunk/src/modules/linkcheck/usecases/linkcheck/usecase.xmap (added)
+++ lenya/trunk/src/modules/linkcheck/usecases/linkcheck/usecase.xmap Mon Jan
30 11:12:39 2006
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 1999-2005 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+ This sitemap handles the creation of link reports
+-->
+<map:sitemap xmlns:map="http://apache.org/cocoon/sitemap/1.0">
+
+ <map:components>
+ <map:generators default="file">
+ <map:generator name="linkStatus"
src="org.apache.lenya.cms.cocoon.generation.LinkStatusGenerator">
+ </map:generator>
+ </map:generators>
+
+ <map:selectors default="browser">
+ <map:selector logger="sitemap.selector.lastmod" name="last-mod"
src="org.apache.lenya.cms.cocoon.selection.LastModSourceSelector"/>
+ </map:selectors>
+ </map:components>
+
+ <!-- =========================== Pipelines
================================ -->
+
+ <map:pipelines>
+
+ <map:pipeline type="noncaching">
+ <map:match pattern="getLinks.xml">
+ <map:select type="last-mod">
+ <map:parameter name="compare-to"
value="lenya://lenya/pubs/{page-envelope:publication-id}/content/{page-envelope:area}/{page-envelope:document-path}"/>
+ <!-- Read from cache -->
+ <!-- If configured within Apache then mod_lenya will nevertheless
read from cache -->
+ <map:when
test="context://lenya/pubs/{page-envelope:publication-id}/work/cache/{page-envelope:area}/{page-envelope:document-id}.linkreport">
+ <map:generate
src="context://lenya/pubs/{page-envelope:publication-id}/work/cache/{page-envelope:area}/{page-envelope:document-id}.linkreport"
mime-type="text/xml; charset=utf-8"/>
+ <map:serialize type="xml"/>
+ </map:when>
+ <!-- Write to cache and serialize -->
+ <map:otherwise>
+ <map:generate type="linkStatus"
src="/{page-envelope:publication-id}/{page-envelope:area}/{page-envelope:document-id}.html">
+ <map:parameter name="depth" value="1"/>
+ </map:generate>
+ <map:transform
src="fallback://lenya/xslt/util/strip_namespaces.xsl"/>
+ <map:transform
src="fallback://lenya/xslt/authoring/edit/addSourceTags.xsl">
+ <map:parameter name="source"
value="context://lenya/pubs/{page-envelope:publication-id}/work/cache/{page-envelope:area}/{page-envelope:document-id}.linkreport"/>
+ </map:transform>
+ <map:transform type="write-source">
+ <map:parameter name="serializer" value="xml"/>
+ </map:transform>
+ <map:transform
src="fallback://lenya/xslt/authoring/edit/removeSourceTags.xsl"/>
+ <map:serialize type="xml"/>
+ </map:otherwise>
+ </map:select>
+ </map:match>
+
+ </map:pipeline>
+
+ </map:pipelines>
+
+</map:sitemap>
\ No newline at end of file
Propchange: lenya/trunk/src/modules/linkcheck/usecases/linkcheck/usecase.xmap
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lenya/trunk/src/pubs/default/sitemap.xmap
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/pubs/default/sitemap.xmap?rev=373570&r1=373569&r2=373570&view=diff
==============================================================================
--- lenya/trunk/src/pubs/default/sitemap.xmap (original)
+++ lenya/trunk/src/pubs/default/sitemap.xmap Mon Jan 30 11:12:39 2006
@@ -196,9 +196,10 @@
<map:otherwise>
<map:match pattern="authoring/**.html">
<map:transform
src="cocoon://lenya-page/{page-envelope:publication-id}/{../../../1}/{../../../2}.xml?doctype={page-envelope:document-type}"/>
+ <!-- TODO: External Links checking should be optional on
a document by document basis -->
<!-- uncomment to turn on external broken link reporting
-->
<!--<map:transform
src="fallback://lenya/xslt/authoring/addJavaScript.xsl">
- <map:parameter name="scriptSRC"
value="/{page-envelope:publication-id}/authoring/javascript/linkreporter.js"/>
+ <map:parameter name="scriptSRC"
value="/modules/linkcheck/linkreporter.js"/>
</map:transform>-->
</map:match>
<map:transform
src="fallback://lenya/xslt/util/strip_namespaces.xsl"/>
Modified: lenya/trunk/src/webapp/lenya/usecases/usecase.xmap
URL:
http://svn.apache.org/viewcvs/lenya/trunk/src/webapp/lenya/usecases/usecase.xmap?rev=373570&r1=373569&r2=373570&view=diff
==============================================================================
--- lenya/trunk/src/webapp/lenya/usecases/usecase.xmap (original)
+++ lenya/trunk/src/webapp/lenya/usecases/usecase.xmap Mon Jan 30 11:12:39 2006
@@ -75,11 +75,16 @@
<map:when test="true">
<map:serialize type="xml"/>
</map:when>
- <map:otherwise>
- <map:transform
src="fallback://lenya/xslt/util/strip_namespaces.xsl"/>
- <map:serialize type="xhtml"/>
- </map:otherwise>
</map:select>
+ <map:select type="request-parameter">
+ <map:parameter name="parameter-name" value="asXML"/>
+ <map:when test="true">
+ <map:serialize type="xml"/>
+ </map:when>
+ </map:select>
+
+ <map:transform src="fallback://lenya/xslt/util/strip_namespaces.xsl"/>
+ <map:serialize type="xhtml"/>
</map:match>
|