Today I carved out a chunk of the day to work on I-Filters. I-Filters are COM
dynamic link libraries that convert known file types to text under
Windows XP/2K/2K3. The OS's indexing service uses I-Filters to convert
PDF and Office file types to text so the indexer can tokenize words
contained in files.
I wrote a test application that calls an I-Filter
library given a file name and converts it to text. The correct filter is determined by
examining the file extension and querying the registry (I-Filters are
registered with associated file extensions). My code works great with
Office documents but barfs when using
Adobe's 6.0 I-Filter.
Below is a synopsis of the method that does the work of invoking the
filter (leave a comment if you want the rest of the code). The CLSID is
the class ID of the filter, read from the registry.
(Apologies for no syntax highlighting)
/// <summary>
/// Runs the I-Filter identified by <paramref name="clsID"/> over
/// <paramref name="sourceFile"/> and returns the concatenated text of every
/// text chunk the filter produces.
/// </summary>
/// <param name="clsID">Class ID of the filter, in a format accepted by the <see cref="Guid"/> constructor.</param>
/// <param name="sourceFile">Path of the file passed to the filter's IPersistFile.Load.</param>
/// <returns>
/// The extracted text, or <see cref="String.Empty"/> when the CLSID cannot be
/// resolved to a type.
/// </returns>
/// <exception cref="FileLoadException">
/// Wraps any exception raised while creating, loading or reading from the filter.
/// </exception>
private static string ExecuteFilter(string clsID, string sourceFile)
{
    string result = String.Empty;

    // Some filters are not reentrant, such as Adobe PDF filter, so all
    // filter invocations are serialized on a single lock.
    lock (_lock)
    {
        object itfc = null;
        try
        {
            // Get the filter type from CLSID.
            Type t = Type.GetTypeFromCLSID(new Guid(clsID));
            if (null != t)
            {
                // Get filter instance.
                itfc = Activator.CreateInstance(t);

                // The same COM object exposes both IFilter and IPersistFile.
                IFilter ifilt = (IFilter)(itfc);
                System.Runtime.InteropServices.UCOMIPersistFile ipf =
                    (System.Runtime.InteropServices.UCOMIPersistFile)(ifilt);

                // Load source.
                ipf.Load(sourceFile, 0);

                // Initialize.
                uint i = 0;
                int hr = 0;
                // Pre-initialized so it is definitely assigned even when
                // GetChunk throws and the catch block swallows the error.
                STAT_CHUNK chunk = new STAT_CHUNK();
                ifilt.Init(IFILTER_INIT.NONE, 0, null, ref i);

                // Read the file in chunks.
                StringBuilder masterBuffer = new StringBuilder();
                while (0 == hr)
                {
                    // Read next chunk structure.
                    try
                    {
                        hr = ifilt.GetChunk(out chunk);
                    }
                    catch (COMException ex)
                    {
                        // GetChunk will throw an exception
                        // when no more chunks to read - tsk.
                        if (FILTER_E_END_OF_CHUNKS == ex.ErrorCode)
                            hr = ex.ErrorCode;
                        else
                            throw; // rethrow without resetting the stack trace
                    }

                    // If chunk is text. The flags member is tested as a bit
                    // mask (per the IFilter documentation it is a combination
                    // of CHUNKSTATE values) rather than by strict equality.
                    if (0 == hr && 0 != (chunk.flags & CHUNKSTATE.CHUNK_TEXT))
                    {
                        // Read text to buffer. Keep reading only while GetText
                        // reports plain success; FILTER_S_LAST_TEXT and any
                        // error code (including FILTER_E_NO_MORE_TEXT) end the
                        // loop.
                        int hr2 = 0;
                        while (0 == hr2)
                        {
                            uint bufferSize = CHUNK_SIZE;
                            StringBuilder buffer = new StringBuilder((int)bufferSize);
                            hr2 = ifilt.GetText(ref bufferSize, buffer);

                            // Only append when the call actually produced text.
                            if (0 == hr2 || FILTER_S_LAST_TEXT == hr2)
                            {
                                // Guard against the reported count exceeding
                                // what was marshalled back into the builder.
                                int length = Math.Min((int)bufferSize, buffer.Length);
                                masterBuffer.Append(buffer.ToString(0, length));
                            }
                        }

                        // Did we get an error? Anything other than the two
                        // normal end-of-text codes is a failure; keep the
                        // HRESULT on the exception for diagnosis.
                        if (FILTER_E_NO_MORE_TEXT != hr2 && FILTER_S_LAST_TEXT != hr2)
                            throw new COMException("Failed reading data from chunk!", hr2);
                    }
                }

                // Assign result.
                result = masterBuffer.ToString();
            }
        }
        catch (Exception ex)
        {
            throw new FileLoadException("Failed to read data from filter!", ex);
        }
        finally
        {
            // Always release the COM object, even on failure.
            if (null != itfc)
                Marshal.ReleaseComObject(itfc);
        }
    }

    return result;
}