XHTML to TEXT Transformer

resplin · ‎6 Jun 2015

The official documentation is at: http://docs.alfresco.com

Introduction

This page describes an Alfresco transformer that converts content of MIME type 'application/xhtml+xml' to plain text, so that XHTML documents stored in Alfresco are indexed properly.

This AMP was developed not by Alfresco engineers but by community members; it is therefore not an official component and comes without any kind of guarantee. You are nevertheless free to use it for whatever you want.

A previous implementation (which lacked the caching of the required DTD and entity sets) can be seen in this post.

Code

The code needed for this reads as follows:

 import java.io.File;
 import java.net.URL;
 import java.util.Map;
 
 import org.alfresco.repo.content.MimetypeMap;
 import org.alfresco.repo.content.transform.AbstractContentTransformer;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.cmr.repository.ContentWriter;
 import org.alfresco.util.TempFileProvider;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.xml.sax.EntityResolver;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXParseException;
 import org.xml.sax.XMLReader;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * (Based on Alfresco's HtmlParserContentTransformer implementation.)
  */
 public class XHtmlParserContentTransformer extends AbstractContentTransformer
 {
   private static final Log logger = LogFactory.getLog(XHtmlParserContentTransformer.class);
 
   /**
    * Name of the package this class resides in. This is used to locate the DTD's and .ent files that
    * the parser needs and which we store locally (instead of telling the parser to load them from
    * w3.org).
    * 
    * @see #transformInternal(ContentReader, ContentWriter, Map)
    */
   private static final String PACKAGE_NAME = XHtmlParserContentTransformer.class.getPackage().getName();
 
   /**
    * Names of the entities which we store locally, instead of directing the parser to w3.org.
    * 
    * @see #transformInternal(ContentReader, ContentWriter, Map)
    */
   private static final String[] CACHED_ENTITIES = {
       'xhtml-basic10-f.dtd',
       'xhtml-lat1.ent',
       'xhtml-mobile10-flat.dtd',
       'xhtml-special.ent',
       'xhtml-symbol.ent',
       'xhtml1-frameset.dtd',
       'xhtml1-strict.dtd',
       'xhtml1-transitional.dtd',
       'xhtml11-flat.dtd'
   };
 
   /**
    * Returns the appropriateness of this transformer: we only support XHTML to TEXT.
    */
   public double getReliability(String sourceMimetype, String targetMimetype)
   {
     if (!MimetypeMap.MIMETYPE_XHTML.equals(sourceMimetype) || !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
     {
       // Only support XHTML -> TEXT
       return 0.0;
     }
     else
     {
       return 1.0;
     }
   }
 
   @Override
   public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options)
       throws Exception
   {
     // We can only work from a file, so create a temporary one
     File xhtmlFile = TempFileProvider.createTempFile('HtmlParserContentTransformer_', '.html');
     reader.getContent(xhtmlFile);
 
     // Get a parser (TODO: create a pool of parsers for efficiency)
     final String parserName = 'org.apache.xerces.parsers.SAXParser';
     XMLReader parser = (XMLReader) Class.forName(parserName).newInstance();
 
     // Avoid downloading DTD's (and other files) from w3.org
     parser.setEntityResolver(new EntityResolver()
     {
       public InputSource resolveEntity(String publicId, String systemId)
       {
         for (String s : CACHED_ENTITIES)
         {
           if (systemId.endsWith(s))
           {
             return new InputSource(XHtmlParserContentTransformer.this.getClass().getClassLoader().getResource(
                 PACKAGE_NAME + '/' + s).toString());
           }
         }
 
         return new InputSource(systemId);
       }
     });
 
     // Disable validation (TODO: make this a configurable setting)
     parser.setFeature('http://xml.org/sax/features/validation', false);
     // parser.setFeature( 'http://xml.org/sax/features/namespaces', false);
     // parser.setFeature( 'http://apache.org/xml/features/validation/schema',
     // setSchemaSupport );
     // parser.setFeature(
     // 'http://apache.org/xml/features/validation/schema-full-checking', false);
     parser.setFeature('http://apache.org/xml/features/continue-after-fatal-error', true);
 
     // Create the extractor
     Converter converter = new Converter(parser, xhtmlFile, reader.getContentUrl());
 
     // Extract
     String text = converter.getText();
 
     // Save
     writer.putContent(text);
   }
 
   private class Converter extends DefaultHandler
   {
     private final XMLReader parser;
     private final StringBuilder text;
     private final String debugInfo; // for debugging only
 
     public Converter(XMLReader parser, File xhtmlFile, String debugInfo) throws Exception
     {
       this.parser = parser;
       this.text = new StringBuilder();
       this.debugInfo = debugInfo;
 
       // Set up parser
       parser.setContentHandler(this);
       parser.setErrorHandler(this);
 
       // Parse
       String path = 'file://' + xhtmlFile.getAbsolutePath();
       long before = System.currentTimeMillis();
       parser.parse(path);
       long after = System.currentTimeMillis();
       logger.debug('Conversion time: ' + (after - before) + 'ms.');
     }
 
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException
     {
       // logger.debug('Chs: \''+ String.valueOf(ch, start, length)+'\'.');
       text.append(ch, start, length);
     }
 
     @Override
     public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException
     {
       // logger.debug('Ign chs: \''+String.valueOf(ch, start, length)+'\'.');
       if (length > 0) text.append(' ');
     }
 
     public final String getText()
     {
       String words = text.toString();
 
       // logger.debug('Text is: ' + words); // tmp
       if (logger.isDebugEnabled()) logger.debug('Text is: ' + words);
       return words;
     }
 
     @Override
     public void error(SAXParseException e) throws SAXException
     {
       logger.error('Conversion failed due to error: ' + e + ' ' + debugInfo);
       throw e;
     }
 
     @Override
     public void fatalError(SAXParseException e) throws SAXException
     {
       logger.error('Conversion failed due to error: ' + e + ' ' + debugInfo);
       throw e;
     }
 
   }
 }

Installation

Create an (empty) Alfresco Module Package to hold your XHTML-to-Text converter.
In the AMP's source code directory, create a hierarchy of directories and a Java file with the code from above. In my AMP, the above code is stored in the file source/java/org/myorg/module/mymodule/transformers/XHtmlParserContentTransformer.java.
Copy the following files from w3.org to the directory source/java/org/myorg/module/mymodule/transformers/: xhtml11-flat.dtd, xhtml1-transitional.dtd, xhtml1-strict.dtd, xhtml1-frameset.dtd, xhtml-symbol.ent, xhtml-special.ent, xhtml-mobile10-flat.dtd, xhtml-lat1.ent, xhtml-basic10-f.dtd, wml13.dtd, wml11.dtd.
Add the following to your AMP's module-context.xml file (adjust the class attribute so that it matches the package in which you placed the Java class):

 <bean id='transformer.XHtmlParser' class='org.icarbasel.module.Derivatives.transformers.XHtmlParserContentTransformer' parent='baseContentTransformer' />

Content Transformation

XHTML to TEXT Transformer

XHTML to TEXT Transformer

Introduction

Code

Installation

We use cookies on this site to enhance your user experience