XHTML to TEXT Transformer

cancel
Showing results for 
Search instead for 
Did you mean: 

XHTML to TEXT Transformer

resplin
Intermediate
0 0 2,171

Obsolete Pages{{Obsolete}}

The official documentation is at: http://docs.alfresco.com




Introduction


This page describes an Alfresco transformer that converts content of MIME type 'application/xhtml+xml' to plain text, so that XHTML documents stored in Alfresco are indexed properly.

This AMP was not developed by Alfresco engineers but contributed by community members; it is therefore neither official nor does it come with any kind of guarantee. You are free to use it, however, for whatever you want.

A previous implementation (which lacked the caching of the required DTD and entity sets) can be seen in this post.


Code


The code needed for this reads as follows:


 import java.io.File;
import java.util.Map;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformer;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
  * (Based on Alfresco's HtmlParserContentTransformer implementation.)
  */
public class XHtmlParserContentTransformer extends AbstractContentTransformer
{
   private static final Log logger = LogFactory.getLog(XHtmlParserContentTransformer.class);

   /**
    * Name of the package this class resides in. This is used to locate the DTD's and .ent files that
    * the parser needs and which we store locally (instead of telling the parser to load them from
    * w3.org).
    *
    * @see #transformInternal(ContentReader, ContentWriter, Map)
    */
   private static final String PACKAGE_NAME = XHtmlParserContentTransformer.class.getPackage().getName();

   /**
    * Names of the entities which we store locally, instead of directing the parser to w3.org.
    *
    * @see #transformInternal(ContentReader, ContentWriter, Map)
    */
   private static final String[] CACHED_ENTITIES = {
       'xhtml-basic10-f.dtd',
       'xhtml-lat1.ent',
       'xhtml-mobile10-flat.dtd',
       'xhtml-special.ent',
       'xhtml-symbol.ent',
       'xhtml1-frameset.dtd',
       'xhtml1-strict.dtd',
       'xhtml1-transitional.dtd',
       'xhtml11-flat.dtd'
   };

   /**
    * Returns the appropriateness of this transformer: we only support XHTML to TEXT.
    */
   public double getReliability(String sourceMimetype, String targetMimetype)
   {
     if (!MimetypeMap.MIMETYPE_XHTML.equals(sourceMimetype) || !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
     {
       // Only support XHTML -> TEXT
       return 0.0;
     }
     else
     {
       return 1.0;
     }
   }

   @Override
   public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options)
       throws Exception
   {
     // We can only work from a file, so create a temporary one
     File xhtmlFile = TempFileProvider.createTempFile('HtmlParserContentTransformer_', '.html');
     reader.getContent(xhtmlFile);

     // Get a parser (TODO: create a pool of parsers for efficiency)
     final String parserName = 'org.apache.xerces.parsers.SAXParser';
     XMLReader parser = (XMLReader) Class.forName(parserName).newInstance();

     // Avoid downloading DTD's (and other files) from w3.org
     parser.setEntityResolver(new EntityResolver()
     {
       public InputSource resolveEntity(String publicId, String systemId)
       {
         for (String s : CACHED_ENTITIES)
         {
           if (systemId.endsWith(s))
           {
             return new InputSource(XHtmlParserContentTransformer.this.getClass().getClassLoader().getResource(
                 PACKAGE_NAME + '/' + s).toString());
           }
         }

         return new InputSource(systemId);
       }
     });

     // Disable validation (TODO: make this a configurable setting)
     parser.setFeature('http://xml.org/sax/features/validation', false);
     // parser.setFeature( 'http://xml.org/sax/features/namespaces', false);
     // parser.setFeature( 'http://apache.org/xml/features/validation/schema',
     // setSchemaSupport );
     // parser.setFeature(
     // 'http://apache.org/xml/features/validation/schema-full-checking', false);
     parser.setFeature('http://apache.org/xml/features/continue-after-fatal-error', true);

     // Create the extractor
     Converter converter = new Converter(parser, xhtmlFile, reader.getContentUrl());

     // Extract
     String text = converter.getText();

     // Save
     writer.putContent(text);
   }

   private class Converter extends DefaultHandler
   {
     private final XMLReader parser;
     private final StringBuilder text;
     private final String debugInfo; // for debugging only

     public Converter(XMLReader parser, File xhtmlFile, String debugInfo) throws Exception
     {
       this.parser = parser;
       this.text = new StringBuilder();
       this.debugInfo = debugInfo;

       // Set up parser
       parser.setContentHandler(this);
       parser.setErrorHandler(this);

       // Parse
       String path = 'file://' + xhtmlFile.getAbsolutePath();
       long before = System.currentTimeMillis();
       parser.parse(path);
       long after = System.currentTimeMillis();
       logger.debug('Conversion time: ' + (after - before) + 'ms.');
     }

     @Override
     public void characters(char[] ch, int start, int length) throws SAXException
     {
       // logger.debug('Chs: \''+ String.valueOf(ch, start, length)+'\'.');
       text.append(ch, start, length);
     }

     @Override
     public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException
     {
       // logger.debug('Ign chs: \''+String.valueOf(ch, start, length)+'\'.');
       if (length > 0) text.append(' ');
     }

     public final String getText()
     {
       String words = text.toString();

       // logger.debug('Text is: ' + words); // tmp
       if (logger.isDebugEnabled()) logger.debug('Text is: ' + words);
       return words;
     }

     @Override
     public void error(SAXParseException e) throws SAXException
     {
       logger.error('Conversion failed due to error: ' + e + ' ' + debugInfo);
       throw e;
     }

     @Override
     public void fatalError(SAXParseException e) throws SAXException
     {
       logger.error('Conversion failed due to error: ' + e + ' ' + debugInfo);
       throw e;
     }

   }
}



Installation


  1. Create an (empty) Alfresco Module Package to hold your XHTML-to-Text converter.
  2. In the AMP's source code directory, create a hierarchy of directories and a Java file with the code from above. In my AMP, the above code is stored in the file source/java/org/myorg/module/mymodule/transformers/XHtmlParserContentTransformer.java.
  3. Copy the following files xhtml11-flat.dtd, xhtml1-transitional.dtd, xhtml1-strict.dtd, xhtml1-frameset.dtd, xhtml-symbol.ent, xhtml-special.ent, xhtml-mobile10-flat.dtd, xhtml-lat1.ent, xhtml-basic10-f.dtd, wml13.dtd, wml11.dtd from w3.org to directory source/java/org/myorg/module/mymodule/transformers/.
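  4. Add the following to your AMP's module-context.xml file, changing the class attribute to the fully qualified name of the class in your own package (for the layout used in step 2 that would be org.myorg.module.mymodule.transformers.XHtmlParserContentTransformer):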
 <bean id='transformer.XHtmlParser' class='org.icarbasel.module.Derivatives.transformers.XHtmlParserContentTransformer' parent='baseContentTransformer' />

Content Transformation