Index: openacs-4/packages/search/tcl/search-convert-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/search/tcl/search-convert-procs.tcl,v
diff -u -r1.5 -r1.6
--- openacs-4/packages/search/tcl/search-convert-procs.tcl	7 Aug 2017 23:48:29 -0000	1.5
+++ openacs-4/packages/search/tcl/search-convert-procs.tcl	3 Sep 2024 15:37:54 -0000	1.6
@@ -32,7 +32,7 @@
     set tmp_filename [ad_tmpnam]
     set result ""
 
-    switch $mime_type {
+    switch -glob $mime_type {
         application/msword -
         application/vnd.ms-word {
             set convert_command {catdoc $filename >$tmp_filename}
@@ -46,7 +46,12 @@
             set convert_command {catppt $filename >$tmp_filename}
         }
         application/pdf {
-            set convert_command {pdftotext -q $filename $tmp_filename}
+            if {![util::file_content_check -type pdf -file $filename]} {
+                ns_log warning "search: $filename ($mime_type) is not a pdf file; skip indexing"
+                file delete -- $tmp_filename
+                return ""
+            }
+            set convert_command {pdftotext $filename $tmp_filename}
         }
         application/vnd.oasis.opendocument.text -
         application/vnd.oasis.opendocument.text-template -
@@ -56,25 +61,115 @@
         application/vnd.oasis.opendocument.presentation-template -
         application/vnd.oasis.opendocument.spreadsheet -
         application/vnd.oasis.opendocument.spreadsheet-template {
-            set convert_command {unzip -p $filename content.xml >$tmp_filename}
+            if {![util::file_content_check -type zip -file $filename]} {
+                ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
+                file delete -- $tmp_filename
+                return ""
+            }
+            set convert_command {[util::which unzip] -p $filename content.xml >$tmp_filename}
         }
+        application/vnd.openxmlformats-officedocument.* {
+            #
+            # File claims to be a MS Office Open XML Format
+            #
+            # Similar to ODF, these files are in fact a zip archive
+            # containing a directory structure that describes the
+            # document. The text content we are looking for is located
+            # in a specific path for every document type, but the
+            # principle is always the same: unzip the xml location
+            # from the archive and return it stripped of any markup.
+            #
+
+            # tmp_filename is unused in this branch: drop it before any early return.
+            file delete -- $tmp_filename
+            switch $mime_type {
+                application/vnd.openxmlformats-officedocument.presentationml.presentation {
+                    #
+                    # PowerPoint .pptx
+                    #
+                    set xml_path ppt/slides/*.xml
+                }
+                application/vnd.openxmlformats-officedocument.spreadsheetml.sheet {
+                    #
+                    # Excel .xlsx
+                    #
+                    set xml_path xl/sharedStrings.xml
+                }
+                application/vnd.openxmlformats-officedocument.wordprocessingml.document {
+                    #
+                    # Word .docx
+                    #
+                    set xml_path word/document.xml
+                }
+                default {
+                    #
+                    # We do not support this file, exit.
+                    #
+                    return ""
+                }
+            }
+
+            #
+            # First check that we can unzip the file
+            #
+            if {![util::file_content_check -type zip -file $filename]} {
+                ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
+                return ""
+            }
+
+            ad_try {
+                #
+                # Extract the markup...
+                #
+                set xml [exec -- [util::which unzip] -p $filename $xml_path]
+                #
+                # ... and clean it up so that only the plain text remains.
+                #
+                return [string trim [ns_striphtml $xml]]
+            } on error {errorMsg} {
+                ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg"
+                return ""
+            }
+        }
         text/html {
-	    file delete -- $tmp_filename
-	    #
-	    # Reading the whole content into memory is not necessarily
-	    # the best when dealing with huge files. However, for
-	    # html-files this is probably ok.
-	    #
+            file delete -- $tmp_filename
+            #
+            # Reading the whole content into memory is not necessarily
+            # the best when dealing with huge files. However, for
+            # html-files this is probably ok.
+            #
             return [ns_striphtml [template::util::read_file $filename]]
         }
         text/plain {
-	    file delete -- $tmp_filename
-	    #
-	    # Reading the whole content into memory is not necessarily
-	    # the best when dealing with huge files. However, for
-	    # txt-files this is probably ok.
-	    #
-	    return [template::util::read_file $filename]
+            file delete -- $tmp_filename
+            #
+            # Don't blindly trust the extension: ask the unix "file" command.
+            # It prints "<path>: <type>", so we take what follows the last colon.
+            #
+            set file_command [::util::which file]
+            if {$file_command ne ""} {
+                set result [exec -ignorestderr $file_command --mime-type $filename]
+                set mime_type [string trim [lindex [split $result ":"] end]]
+                #
+                # Maybe, we are too restrictive by the following test,
+                # but let us be conservative first.
+                #
+                if {$mime_type ne "text/plain"} {
+                    #
+                    # The available file is not what it pretends to
+                    # be. We could try further to extract content, but
+                    # we simply give up here.
+                    #
+                    ns_log notice "search-convert: not a plain text file $result"
+                    return ""
+                }
+            }
+            #
+            # Reading the whole content into memory is not necessarily
+            # the best when dealing with huge files. However, for
+            # txt-files this is probably ok.
+            #
+            return [template::util::read_file $filename]
         }
 
         default {
@@ -84,16 +179,24 @@
         }
     }
 
-    if {[catch {eval exec $convert_command} err]} {
-        catch {file delete -- $tmp_filename}
-        ns_log Error "SEARCH: conversion failed - $convert_command: $err"
-        return
+    ad_try {
+        set convert_command [subst $convert_command]
+        exec -- {*}$convert_command
+    } on error {errorMsg} {
+        set pw_pos [string first "Command Line Error: Incorrect password" $errorMsg] ;# needle first, then haystack
+        if {$mime_type eq "application/pdf" && $pw_pos >= 0} {
+            ns_log warning "SEARCH: pdf seems password protected - $convert_command"
+        } else {
+            ns_log error "SEARCH: conversion failed - $convert_command: $errorMsg"
+        }
+    } on ok {d} {
+        set fd [open $tmp_filename "r"]
+        set result [read $fd]
+        close $fd
+    } finally {
+        file delete -- $tmp_filename
+    }
 
-    set fd [open $tmp_filename "r"]
-    set result [read $fd]
-    close $fd
-    file delete -- $tmp_filename
     return $result
 }