Indexing PDFs with Sitecore 7.5 and a custom crawler using iTextSharp

As you probably know, Sitecore indexes PDFs using the Adobe iFilter…

The Adobe iFilter technology is not really friendly: it makes use of COM objects, which means that you are going to have a lot of security issues and dependencies on those COM objects.

On top of that, it seems that there are proven solutions based on the iFilter up to version 9, but unfortunately you can now download only version 11; you can read more about this issue here.

So I can recommend two solutions:

  1. Buy a license for a third-party tool (like Foxit)
  2. Write your own media crawler following this post

When it comes to parsing PDFs there are several options; I have chosen iTextSharp, which seems widely used and supported.

This is the code that you need for your custom media crawler:

using System;
using System.IO;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Sitecore.ContentSearch;
using Sitecore.ContentSearch.ComputedFields;
using Sitecore.ContentSearch.Diagnostics;
using Sitecore.ContentSearch.Extracters.IFilterTextExtraction;
using Sitecore.Data.Items;
using Sitecore.Diagnostics;

namespace xxx.Crawler.Pdf
{
    /// <summary>
    /// Computed index field that extracts the searchable text of a media item.
    /// PDF files are parsed with iTextSharp; every other media type is handed to
    /// the IFilter-based <see cref="FilterReader"/>.
    /// </summary>
    public class MediaContentExtractor : IComputedIndexField
    {
        /// <summary>Name of the index field, as required by <see cref="IComputedIndexField"/>.</summary>
        public string FieldName { get; set; }

        /// <summary>Return type of the index field, as required by <see cref="IComputedIndexField"/>.</summary>
        public string ReturnType { get; set; }

        /// <summary>
        /// Computes the text content to index for <paramref name="indexable"/>.
        /// </summary>
        /// <param name="indexable">The indexable being crawled; must wrap a Sitecore item.</param>
        /// <returns>
        /// The extracted text for media items (possibly empty when extraction fails),
        /// or <c>null</c> when the item is not a media item.
        /// </returns>
        public object ComputeFieldValue(IIndexable indexable)
        {
            Item item = (SitecoreIndexableItem) indexable;
            Assert.ArgumentNotNull(item, "item");

            if (!item.Paths.IsMediaItem)
            {
                return null;
            }

            MediaItem media = item;

            // Ordinal, case-insensitive comparison: a culture-sensitive ToLower() can
            // fail to match "PDF" under cultures such as tr-TR.
            bool isPdf = string.Equals(media.Extension, "pdf", StringComparison.OrdinalIgnoreCase)
                         || string.Equals(media.MimeType, "application/pdf", StringComparison.OrdinalIgnoreCase);

            return isPdf ? ParsePDF(media) : ParseItemsWithIfilters(media);
        }


        /// <summary>
        /// Extracts the Title, Subject and Keywords metadata plus the text of every page
        /// of a PDF media item, separated by single spaces.
        /// </summary>
        /// <param name="mediaItem">The PDF media item; may be <c>null</c>.</param>
        /// <returns>The extracted text, or an empty string when nothing could be read.</returns>
        private string ParsePDF(MediaItem mediaItem)
        {
            if (mediaItem == null)
            {
                return string.Empty;
            }

            var builder = new StringBuilder();
            try
            {
                using (Stream stream = mediaItem.GetMediaStream())
                {
                    if (stream == null)
                    {
                        // No binary data attached to the media item: nothing to index.
                        return string.Empty;
                    }

                    var reader = new PdfReader(stream);
                    try
                    {
                        // Read the metadata dictionary once instead of once per lookup.
                        var info = reader.Info;
                        foreach (string key in new[] { "Title", "Subject", "Keywords" })
                        {
                            string value;
                            if (info.TryGetValue(key, out value))
                            {
                                AppendSegment(builder, value);
                            }
                        }

                        for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
                        {
                            // A fresh strategy per page: SimpleTextExtractionStrategy keeps the
                            // text it has already seen, so a shared instance would return the
                            // previous pages again on every call and duplicate the content.
                            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                            AppendSegment(builder, PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy));
                        }
                    }
                    finally
                    {
                        // Release the reader's resources even when extraction fails midway.
                        reader.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                CrawlingLog.Log.Error(ex.ToString(), ex);
                return string.Empty;
            }

            return builder.ToString();
        }


        /// <summary>
        /// Extracts text from a non-PDF media item through <see cref="FilterReader"/>.
        /// Only file-backed media can be read because the reader is given a file path.
        /// </summary>
        /// <param name="mediaItem">The media item to read.</param>
        /// <returns>
        /// The lower-cased text with line breaks turned into spaces, or an empty string
        /// when the media is not file-backed or extraction fails.
        /// </returns>
        private string ParseItemsWithIfilters(MediaItem mediaItem)
        {
            string content = string.Empty;
            try
            {
                using (Stream mediaStream = mediaItem.GetMediaStream())
                {
                    // FilterReader needs a path on disk; a stream that is not a FileStream
                    // (or a missing stream) cannot be processed, so skip it quietly instead
                    // of raising and logging an InvalidCastException for every such item.
                    var fileStream = mediaStream as FileStream;
                    if (fileStream == null)
                    {
                        return string.Empty;
                    }

                    using (TextReader reader = new FilterReader(fileStream.Name))
                    {
                        content = reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                CrawlingLog.Log.Error(ex.ToString(), ex);
            }

            if (!string.IsNullOrWhiteSpace(content))
            {
                // Replace line breaks with a space so the last word of one line is not
                // fused with the first word of the next.
                content = content.Replace("\r\n", " ").ToLowerInvariant();
            }

            return content;
        }


        /// <summary>
        /// Appends <paramref name="text"/> to <paramref name="builder"/>, inserting a single
        /// space before it when the builder already holds content. Blank text is ignored.
        /// </summary>
        private static void AppendSegment(StringBuilder builder, string text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            if (builder.Length > 0)
            {
                builder.Append(' ');
            }

            builder.Append(text);
        }
    }
}

Obviously, you also need to amend the file: Sitecore.ContentSearch.Lucene.DefaultIndexConfiguration

<!--<field fieldName="_content"                 type="Sitecore.ContentSearch.ComputedFields.MediaItemContentExtractor,Sitecore.ContentSearch">
<mediaIndexing ref="contentSearch/indexConfigurations/defaultLuceneIndexConfiguration/mediaIndexing">
            </field>-->
<field fieldName="_content" storageType="no" indexType="tokenized">xxx.Crawler.Pdf.MediaContentExtractor, xxx.Crawler.Pdf</field>

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s