Parse MS Office Word file using OLE in Perl.


/ Published in: Perl
Save to your folder(s)

Uses Microsoft OLE (Object Linking and Embedding) automation to manipulate Office 2003/2007 Word files.


Copy this code and paste it in your HTML
  #!/path/to/perl.exe -w
  # [email protected] @ 2009/7/8 (version according to date)
  #
  # Uses Microsoft OLE automation (Win32::OLE) to drive Word and dump the text
  # of an Office 2003/2007 Word document (*.doc / *.docx) into "<input>.txt",
  # one paragraph per line.
  #
  # Usage:  perl this_script.pl /absolute/path/to/word.doc(x)
  # Exit status: 1 on usage error or when Word's Documents collection is
  # unavailable; dies if Word cannot be started, the document cannot be
  # opened, or the output file cannot be written; 0 otherwise.

  use Win32::OLE qw(in with);
  use strict;
  use warnings;

  my $VERSION = "2009/07/08";
  my $usage   = "Usage of Ver: $VERSION:\n"
      . "perl " . __FILE__ . " /absolute/path/to/word.doc(x)\n";

  # Test @ARGV (the argument list). %ARGV is an unrelated, always-empty hash,
  # so testing it rejected every invocation.
  if (!@ARGV) {
      # print rather than printf: the script path may contain '%' characters.
      print $usage;
      exit 1;
  }

  my $File    = $ARGV[0];
  my $FileLog = $File . ".txt";

  # The second argument ('Quit') is the method Win32::OLE invokes on the
  # application object when it is destroyed, so Word is shut down on exit.
  my $Word = Win32::OLE->new('Word.Application', 'Quit') or die "Couldn't run Word";
  if (!$Word->Documents) {
      print "Word->Documents is unavailable.\n";
      exit 1;
  }

  my $Doc = $Word->Documents->Open($File) or die "Cannot open file: $File.\n";

  # The whole contents of this Office Word file (*.doc(x)), one cleaned-up
  # entry per non-empty paragraph.
  my @paras;

  for my $para (in($Doc->Paragraphs)) {
      my $text = $para->Range->{Text};

      # Skip paragraphs shorter than two characters (e.g. only an
      # end-of-paragraph mark).
      next if !defined $text || length($text) < 2;

      # Strip whitespace; this also removes any trailing CR/LF.
      # NOTE(review): this removes ALL whitespace, including the spaces between
      # words, exactly as the original script did -- confirm that is intended.
      $text =~ s/\s+//g;
      next if $text eq '';

      push @paras, $text;
  }

  $Doc->Close;

  if (@paras) {
      # Three-argument open with a lexical handle: the file name can no longer
      # be misread as an open mode, and the handle is scoped to this block.
      open my $log, '>', $FileLog
          or die "Cannot open log file: $FileLog: $!\n";
      print {$log} "$_\n" for @paras;
      # Buffered write errors only surface at close time, so check it.
      close $log or die "Cannot close log file: $FileLog: $!\n";
      print "$File has been textlized to file $FileLog.\n";
  }
  else {
      print "Sorry buddy, I tried hard but still can not parse this ms office word file.\n";
      # Nothing is written in this branch, so do not claim otherwise.
      print "No text was extracted, so ", $FileLog, " was not written.\n";
  }

  exit 0;

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.