Thursday, June 13, 2013

Convert Word documents using Interop API

I have a requirement to convert millions of documents to HTML while preserving their formatting and style, so I am trying out the SaveAs API of Microsoft.Office.Interop.Word.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using Word = Microsoft.Office.Interop.Word;
using Microsoft.Office.Interop.Word;

/// <summary>
/// Converts documents between formats by driving a single hidden Word
/// application instance through the Microsoft.Office.Interop.Word API.
/// One instance can be reused for many <see cref="Convert"/> calls; call
/// <see cref="Dispose"/> when finished so the Word instance is shut down.
/// </summary>
public class WordTool: IDisposable
  {
    // The Word application shared by all conversions; null once disposed.
    Word._Application oWord;

    // Boxed argument values. They are fields (and deliberately not readonly)
    // because the interop methods take every argument by reference.
    object oMissing = System.Reflection.Missing.Value;
    object isVisible = true;
    object readOnly = true;
    object oSaveChanges = false;

    /// <summary>
    /// Starts a Word application instance with its window hidden and
    /// alerts turned off.
    /// </summary>
    public WordTool()
    {
      // Create an instance of Word.exe
      oWord = new Word.Application();
      oWord.Visible = false;
      oWord.DisplayAlerts = Word.WdAlertLevel.wdAlertsNone;
    }

    /// <summary>
    /// Opens <paramref name="input"/> read-only and saves it to
    /// <paramref name="output"/> in the format selected by the extension
    /// of <paramref name="output"/> (case-insensitive). Unrecognized
    /// extensions are saved as plain text.
    /// </summary>
    /// <param name="input">Path of the document to open.</param>
    /// <param name="output">Path to save to; its extension picks the format.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="input"/> or <paramref name="output"/> is null.
    /// </exception>
    /// <exception cref="ObjectDisposedException">
    /// This instance has already been disposed.
    /// </exception>
    public void Convert(string input, string output)
    {
      if (input == null)
        throw new ArgumentNullException("input");
      if (output == null)
        throw new ArgumentNullException("output");
      if (oWord == null)
        throw new ObjectDisposedException("WordTool");

      WdSaveFormat format;
      // Invariant lower-casing keeps the match independent of the current culture.
      switch (Path.GetExtension(output).ToLowerInvariant())
      {
        case ".doc":
          format = WdSaveFormat.wdFormatDocument;
          break;
        case ".docx":
          format = WdSaveFormat.wdFormatDocumentDefault;
          break;
        case ".htm":
          format = WdSaveFormat.wdFormatHTML;
          break;
        case ".html":
          format = WdSaveFormat.wdFormatFilteredHTML;
          break;
        case ".pdf":
          format = WdSaveFormat.wdFormatPDF;
          break;
        case ".rtf":
          format = WdSaveFormat.wdFormatRTF;
          break;
        case ".mht":
          format = WdSaveFormat.wdFormatWebArchive;
          break;
        case ".xps":
          format = WdSaveFormat.wdFormatXPS;
          break;
        case ".txt":
          format = WdSaveFormat.wdFormatTextLineBreaks;
          break;
        case ".xml":
          format = WdSaveFormat.wdFormatFlatXML;
          break;
        default:
          format = WdSaveFormat.wdFormatText;
          break;
      }

      object oFormat = format;
      object oInput = input;
      object oOutput = output;

      Word._Document oDoc = null;
      try
      {
        // Load a document into our instance of word.exe
        oDoc = oWord.Documents.Open(ref oInput,
          ref oMissing, ref readOnly, ref oMissing,
          ref oMissing, ref oMissing, ref oMissing,
          ref oMissing, ref oMissing, ref oMissing,
          ref oMissing, ref isVisible, ref oMissing,
          ref oMissing, ref oMissing, ref oMissing);

        // Make this document the active document.
        oDoc.Activate();

        // Save this document in the format chosen from the output extension.
        oDoc.SaveAs(ref oOutput, ref oFormat,
          ref oMissing, ref oMissing, ref oMissing,
          ref oMissing, ref oMissing, ref oMissing,
          ref oMissing, ref oMissing, ref oMissing,
          ref oMissing, ref oMissing, ref oMissing,
          ref oMissing, ref oMissing);
      }
      finally
      {
        // Always close the document, even when the conversion failed;
        // otherwise it stays open in the shared Word instance.
        if (oDoc != null)
        {
          try
          {
            oDoc.Close(ref oSaveChanges, ref oMissing, ref oMissing);
          }
          finally
          {
            // Release the COM wrapper now instead of waiting for the GC,
            // since one instance may convert a very large number of files.
            System.Runtime.InteropServices.Marshal.ReleaseComObject(oDoc);
          }
        }
      }
    }

    /// <summary>
    /// Quits the Word application without saving changes and releases its
    /// COM wrapper. Calling it again after the first call does nothing.
    /// </summary>
    public void Dispose()
    {
      Word._Application app = oWord;
      if (null == app)
        return;

      // Clear the field first so a repeated Dispose never calls Quit twice.
      oWord = null;
      try
      {
        app.Quit(ref oSaveChanges, ref oMissing, ref oMissing);
      }
      finally
      {
        System.Runtime.InteropServices.Marshal.ReleaseComObject(app);
      }
    }
  }

This solution is based on code originally found on Stack Overflow.