Obsolete Pages{{Obsolete}}
The official documentation is at: http://docs.alfresco.com
This page describes an Alfresco transformer that converts content of MIME type 'application/xhtml+xml' to plain text, so that XHTML documents stored in Alfresco get indexed properly.
This AMP was not developed by Alfresco engineers but contributed by community members; it is therefore neither official nor does it come with any kind of guarantee. You are free to use it, however, for whatever you want.
A previous implementation (which lacked the caching of the required DTD and entity sets) can be seen in this post.
The code needed for this reads as follows:
import java.io.File;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformer;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
* (Based on Alfresco's HtmlParserContentTransformer implementation.)
*/
public class XHtmlParserContentTransformer extends AbstractContentTransformer
{
private static final Log logger = LogFactory.getLog(XHtmlParserContentTransformer.class);
/**
* Name of the package this class resides in. This is used to locate the DTD's and .ent files that
* the parser needs and which we store locally (instead of telling the parser to load them from
* w3.org).
*
* @see #transformInternal(ContentReader, ContentWriter, Map)
*/
private static final String PACKAGE_NAME = XHtmlParserContentTransformer.class.getPackage().getName();
/**
* Names of the entities which we store locally, instead of directing the parser to w3.org.
*
* @see #transformInternal(ContentReader, ContentWriter, Map)
*/
private static final String[] CACHED_ENTITIES = {
'xhtml-basic10-f.dtd',
'xhtml-lat1.ent',
'xhtml-mobile10-flat.dtd',
'xhtml-special.ent',
'xhtml-symbol.ent',
'xhtml1-frameset.dtd',
'xhtml1-strict.dtd',
'xhtml1-transitional.dtd',
'xhtml11-flat.dtd'
};
/**
* Returns the appropriateness of this transformer: we only support XHTML to TEXT.
*/
public double getReliability(String sourceMimetype, String targetMimetype)
{
if (!MimetypeMap.MIMETYPE_XHTML.equals(sourceMimetype) || !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// Only support XHTML -> TEXT
return 0.0;
}
else
{
return 1.0;
}
}
@Override
public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options)
throws Exception
{
// We can only work from a file, so create a temporary one
File xhtmlFile = TempFileProvider.createTempFile('HtmlParserContentTransformer_', '.html');
reader.getContent(xhtmlFile);
// Get a parser (TODO: create a pool of parsers for efficiency)
final String parserName = 'org.apache.xerces.parsers.SAXParser';
XMLReader parser = (XMLReader) Class.forName(parserName).newInstance();
// Avoid downloading DTD's (and other files) from w3.org
parser.setEntityResolver(new EntityResolver()
{
public InputSource resolveEntity(String publicId, String systemId)
{
for (String s : CACHED_ENTITIES)
{
if (systemId.endsWith(s))
{
return new InputSource(XHtmlParserContentTransformer.this.getClass().getClassLoader().getResource(
PACKAGE_NAME + '/' + s).toString());
}
}
return new InputSource(systemId);
}
});
// Disable validation (TODO: make this a configurable setting)
parser.setFeature('http://xml.org/sax/features/validation', false);
// parser.setFeature( 'http://xml.org/sax/features/namespaces', false);
// parser.setFeature( 'http://apache.org/xml/features/validation/schema',
// setSchemaSupport );
// parser.setFeature(
// 'http://apache.org/xml/features/validation/schema-full-checking', false);
parser.setFeature('http://apache.org/xml/features/continue-after-fatal-error', true);
// Create the extractor
Converter converter = new Converter(parser, xhtmlFile, reader.getContentUrl());
// Extract
String text = converter.getText();
// Save
writer.putContent(text);
}
private class Converter extends DefaultHandler
{
private final XMLReader parser;
private final StringBuilder text;
private final String debugInfo; // for debugging only
public Converter(XMLReader parser, File xhtmlFile, String debugInfo) throws Exception
{
this.parser = parser;
this.text = new StringBuilder();
this.debugInfo = debugInfo;
// Set up parser
parser.setContentHandler(this);
parser.setErrorHandler(this);
// Parse
String path = 'file://' + xhtmlFile.getAbsolutePath();
long before = System.currentTimeMillis();
parser.parse(path);
long after = System.currentTimeMillis();
logger.debug('Conversion time: ' + (after - before) + 'ms.');
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException
{
// logger.debug('Chs: \''+ String.valueOf(ch, start, length)+'\'.');
text.append(ch, start, length);
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException
{
// logger.debug('Ign chs: \''+String.valueOf(ch, start, length)+'\'.');
if (length > 0) text.append(' ');
}
public final String getText()
{
String words = text.toString();
// logger.debug('Text is: ' + words); // tmp
if (logger.isDebugEnabled()) logger.debug('Text is: ' + words);
return words;
}
@Override
public void error(SAXParseException e) throws SAXException
{
logger.error('Conversion failed due to error: ' + e + ' ' + debugInfo);
throw e;
}
@Override
public void fatalError(SAXParseException e) throws SAXException
{
logger.error('Conversion failed due to error: ' + e + ' ' + debugInfo);
throw e;
}
}
}
<!-- Spring bean definition for the transformer class shown above; the remaining configuration is inherited from the parent bean 'baseContentTransformer'. -->
<bean id='transformer.XHtmlParser' class='org.icarbasel.module.Derivatives.transformers.XHtmlParserContentTransformer' parent='baseContentTransformer' />
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
Ask for and offer help to other Alfresco Content Services Users and members of the Alfresco team.
Related links:
By using this site, you are agreeing to allow us to collect and use cookies as outlined in Alfresco’s Cookie Statement and Terms of Use (and you have a legitimate interest in Alfresco and our products, authorizing us to contact you in such methods). If you are not ok with these terms, please do not use this website.