Indexing PDF with sitecore 7.5 and a custom crawler using ITextSharp

As you probably know, Sitecore indexes PDFs using the Adobe iFilter…

Adobe iFilter technology is not really friendly: it makes use of COM objects, which means that you are going to have a lot of security issues and dependencies on those COM objects.

On top of that, it seems that there are proven solutions based on the iFilter up to version 9, but unfortunately you can now download only version 11; you can read more about this issue here.

So I can recommend two solutions:

  1. Buy the license of a third party tool (like FoxIt)
  2. Write your own media crawler following this post

When it comes to parsing PDFs there are several options; I have chosen iTextSharp, which seems widely used and supported.

This is the code that you need for your custom media crawler:

using System;
using System.IO;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Sitecore.ContentSearch;
using Sitecore.ContentSearch.ComputedFields;
using Sitecore.ContentSearch.Diagnostics;
using Sitecore.ContentSearch.Extracters.IFilterTextExtraction;
using Sitecore.Data.Items;
using Sitecore.Diagnostics;

namespace xxx.Crawler.Pdf
{
    /// <summary>
    /// Computed index field that extracts the searchable text of a media item.
    /// PDF files are parsed with iTextSharp; every other media type is handed
    /// to the IFilter-based <see cref="FilterReader"/>.
    /// </summary>
    public class MediaContentExtractor : IComputedIndexField
    {
        private const string PdfExtension = "pdf";
        private const string PdfMimeType = "application/pdf";

        // Document-information entries that are indexed in front of the page text.
        private static readonly string[] PdfInfoKeys = { "Title", "Subject", "Keywords" };

        /// <summary>Name of the index field this computed field populates.</summary>
        public string FieldName { get; set; }

        /// <summary>Return type declared for this computed field in the index configuration.</summary>
        public string ReturnType { get; set; }

        /// <summary>
        /// Computes the text content of the media item behind <paramref name="indexable"/>.
        /// </summary>
        /// <param name="indexable">The indexable being crawled; must not be null.</param>
        /// <returns>
        /// The extracted text, or <c>null</c> when the indexable is not a Sitecore
        /// item or the item is not a media item.
        /// </returns>
        public object ComputeFieldValue(IIndexable indexable)
        {
            Assert.ArgumentNotNull(indexable, "indexable");

            // "as" instead of a hard cast: indexables that are not Sitecore items
            // are skipped rather than failing with an InvalidCastException.
            var indexableItem = indexable as SitecoreIndexableItem;
            if (indexableItem == null)
            {
                return null;
            }

            Item item = indexableItem;
            if (item == null || !item.Paths.IsMediaItem)
            {
                return null;
            }

            MediaItem media = item;

            // Ordinal, case-insensitive comparison: culture independent and safe
            // when the extension or MIME type is null.
            bool isPdf =
                string.Equals(media.Extension, PdfExtension, StringComparison.OrdinalIgnoreCase) ||
                string.Equals(media.MimeType, PdfMimeType, StringComparison.OrdinalIgnoreCase);

            if (isPdf)
            {
                return ParsePDF(media);
            }

            return ParseItemsWithIfilters(media);
        }

        /// <summary>
        /// Extracts the Title/Subject/Keywords metadata and the text of every page
        /// of a PDF media item.
        /// </summary>
        /// <param name="mediaItem">The PDF media item; may be null.</param>
        /// <returns>
        /// The extracted text, one entry per line; an empty string when there is
        /// nothing to read or when parsing fails (the failure is logged).
        /// </returns>
        private string ParsePDF(MediaItem mediaItem)
        {
            if (mediaItem == null)
            {
                return string.Empty;
            }

            try
            {
                using (Stream mediaStream = mediaItem.GetMediaStream())
                {
                    if (mediaStream == null)
                    {
                        return string.Empty;
                    }

                    var reader = new PdfReader(mediaStream);
                    try
                    {
                        var builder = new StringBuilder();

                        // Read the info dictionary once and reuse it for all keys.
                        var info = reader.Info;
                        if (info != null)
                        {
                            foreach (string key in PdfInfoKeys)
                            {
                                string value;
                                if (info.TryGetValue(key, out value) && !string.IsNullOrWhiteSpace(value))
                                {
                                    // One entry per line so adjacent values do not
                                    // fuse into a single token in the index.
                                    builder.AppendLine(value);
                                }
                            }
                        }

                        int pageCount = reader.NumberOfPages;
                        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                        {
                            // A fresh strategy per page: SimpleTextExtractionStrategy
                            // accumulates everything it has seen, so sharing one
                            // instance would re-emit all previous pages each time.
                            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                            builder.AppendLine(PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy));
                        }

                        return builder.ToString();
                    }
                    finally
                    {
                        // Release the resources held by the reader even when extraction fails.
                        reader.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                CrawlingLog.Log.Error(ex.ToString(), ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Extracts the text of a non-PDF media item through the IFilter installed
        /// for its file type.
        /// </summary>
        /// <param name="mediaItem">The media item to read.</param>
        /// <returns>
        /// The lower-cased text with line breaks replaced by spaces; an empty string
        /// when the media is not backed by a file or extraction fails (logged).
        /// </returns>
        private string ParseItemsWithIfilters(MediaItem mediaItem)
        {
            string content = string.Empty;
            try
            {
                // FilterReader works on a file path, so only file-backed media can
                // be processed here. The stream is needed for its path only and is
                // released before the filter opens the file.
                string filePath = null;
                using (Stream mediaStream = mediaItem.GetMediaStream())
                {
                    var fileStream = mediaStream as FileStream;
                    if (fileStream != null)
                    {
                        filePath = fileStream.Name;
                    }
                }

                if (filePath == null)
                {
                    CrawlingLog.Log.Warn(
                        string.Format("Media item '{0}' is not stored as a file; IFilter extraction skipped.", mediaItem.Name),
                        null);
                    return string.Empty;
                }

                using (TextReader reader = new FilterReader(filePath))
                {
                    content = reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                CrawlingLog.Log.Error(ex.ToString(), ex);
            }

            if (!string.IsNullOrWhiteSpace(content))
            {
                // Replace line breaks with a space (not nothing) so the last word
                // of one line does not merge with the first word of the next.
                content = content.Replace("\r\n", " ").ToLowerInvariant();
            }

            return content;
        }
    }
}

Obviously you also need to amend the file Sitecore.ContentSearch.Lucene.DefaultIndexConfiguration:

<!-- The stock _content computed field is left commented out below; the active <field> entry registers the custom MediaContentExtractor instead. -->
<!--<field fieldName="_content"                 type="Sitecore.ContentSearch.ComputedFields.MediaItemContentExtractor,Sitecore.ContentSearch">
<mediaIndexing ref="contentSearch/indexConfigurations/defaultLuceneIndexConfiguration/mediaIndexing">
            </field>-->
<field fieldName="_content" storageType="no" indexType="tokenized">xxx.Crawler.Pdf.MediaContentExtractor, xxx.Crawler.Pdf</field>
Advertisements