Index: openacs-4/packages/search/tcl/search-convert-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/search/tcl/search-convert-procs.tcl,v
diff -u -r1.7 -r1.8
--- openacs-4/packages/search/tcl/search-convert-procs.tcl	11 Sep 2024 06:15:53 -0000	1.7
+++ openacs-4/packages/search/tcl/search-convert-procs.tcl	20 Nov 2024 15:33:34 -0000	1.8
@@ -29,95 +29,98 @@
         return ""
     }
 
-    set tmp_filename [ad_tmpnam]
-    set result ""
+    ad_try {
 
-    switch -glob $mime_type {
-        application/msword -
-        application/vnd.ms-word {
-            set convert_command {catdoc $filename >$tmp_filename}
-        }
-        application/msexcel -
-        application/vnd.ms-excel {
-            set convert_command {xls2csv $filename >$tmp_filename 2> /dev/null}
-        }
-        application/mspowerpoint -
-        application/vnd.ms-powerpoint {
-            set convert_command {catppt $filename >$tmp_filename}
-        }
-        application/pdf {
-            if {![util::file_content_check -type pdf -file $filename]} {
-                ns_log warning "search: $filename ($mime_type) is not a pdf file; skip indexing"
-                file delete -- $tmp_filename
-                return ""
+        switch -glob $mime_type {
+            application/msword -
+            application/vnd.ms-word {
+                return [exec -- catdoc $filename]
             }
-            set convert_command {pdftotext $filename $tmp_filename}
-        }
-        application/vnd.oasis.opendocument.text -
-        application/vnd.oasis.opendocument.text-template -
-        application/vnd.oasis.opendocument.text-web -
-        application/vnd.oasis.opendocument.text-master -
-        application/vnd.oasis.opendocument.presentation -
-        application/vnd.oasis.opendocument.presentation-template -
-        application/vnd.oasis.opendocument.spreadsheet -
-        application/vnd.oasis.opendocument.spreadsheet-template {
-            if {![util::file_content_check -type zip -file $filename]} {
-                ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
-                file delete -- $tmp_filename
-                return ""
+            application/msexcel -
+            application/vnd.ms-excel {
+                return [exec -ignorestderr -- xls2csv $filename]
             }
-            set convert_command {[util::which unzip] -p $filename content.xml >$tmp_filename}
-        }
-        application/vnd.openxmlformats-officedocument.* {
-            #
-            # File claims to be a MS Office Open XML Format
-            #
-            # Similar to ODF, these files are in fact a zip archive
-            # containing a directory structure that describes the
-            # document. The text content we are looking for is located
-            # in a specific path for every document type, but the
-            # principle is always the same: unzip the xml location
-            # from the archive and return it stripped of any markup.
-            #
-
-            switch $mime_type {
-                application/vnd.openxmlformats-officedocument.presentationml.presentation {
-                    #
-                    # PowerPoint .pptx
-                    #
-                    set xml_path ppt/slides/*.xml
+            application/mspowerpoint -
+            application/vnd.ms-powerpoint {
+                return [exec -- catppt $filename]
+            }
+            application/pdf {
+                if {![util::file_content_check -type pdf -file $filename]} {
+                    ns_log warning "search: $filename ($mime_type) is not a pdf file; skip indexing"
+                    return ""
+                } else {
+                    return [exec -- pdftotext $filename -]
                 }
-                application/vnd.openxmlformats-officedocument.spreadsheetml.sheet {
+            }
+            application/vnd.oasis.opendocument.text -
+            application/vnd.oasis.opendocument.text-template -
+            application/vnd.oasis.opendocument.text-web -
+            application/vnd.oasis.opendocument.text-master -
+            application/vnd.oasis.opendocument.presentation -
+            application/vnd.oasis.opendocument.presentation-template -
+            application/vnd.oasis.opendocument.spreadsheet -
+            application/vnd.oasis.opendocument.spreadsheet-template {
+                if {![util::file_content_check -type zip -file $filename]} {
+                    ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
+                    return ""
+                } else {
                     #
-                    # Excel .xlsx
+                    # Extract the markup...
                     #
-                    set xml_path xl/sharedStrings.xml
-                }
-                application/vnd.openxmlformats-officedocument.wordprocessingml.document {
+                    set xml [exec -- [util::which unzip] -p $filename content.xml]
                     #
-                    # Word .docx
+                    # ... and clean it up so that only the plain text remains.
                     #
-                    set xml_path word/document.xml
+                    return [string trim [ns_striphtml $xml]]
                 }
-                default {
-                    #
-                    # We do not support this file, exit.
-                    #
-                    return ""
-                }
             }
+            application/vnd.openxmlformats-officedocument.* {
+                #
+                # File claims to be a MS Office Open XML Format
+                #
+                # Similar to ODF, these files are in fact a zip archive
+                # containing a directory structure that describes the
+                # document. The text content we are looking for is located
+                # in a specific path for every document type, but the
+                # principle is always the same: unzip the xml location
+                # from the archive and return it stripped of any markup.
+                #
 
-            file delete -- $tmp_filename
+                switch $mime_type {
+                    application/vnd.openxmlformats-officedocument.presentationml.presentation {
+                        #
+                        # PowerPoint .pptx
+                        #
+                        set xml_path ppt/slides/*.xml
+                    }
+                    application/vnd.openxmlformats-officedocument.spreadsheetml.sheet {
+                        #
+                        # Excel .xlsx
+                        #
+                        set xml_path xl/sharedStrings.xml
+                    }
+                    application/vnd.openxmlformats-officedocument.wordprocessingml.document {
+                        #
+                        # Word .docx
+                        #
+                        set xml_path word/document.xml
+                    }
+                    default {
+                        #
+                        # We do not support this file, exit.
+                        #
+                        return ""
+                    }
+                }
 
-            #
-            # First check that we can unzip the file
-            #
-            if {![util::file_content_check -type zip -file $filename]} {
-                ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
-                return ""
-            }
+                #
+                # First check that we can unzip the file
+                #
+                if {![util::file_content_check -type zip -file $filename]} {
+                    ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
+                    return ""
+                }
 
-            ad_try {
                 #
                 # Extract the markup...
                 #
@@ -126,78 +129,58 @@
                 # ... and clean it up so that only the plain text remains.
                 #
                 return [string trim [ns_striphtml $xml]]
-            } on error {errorMsg} {
-                ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg"
-                return ""
             }
-        }
-        text/html {
-            file delete -- $tmp_filename
-            #
-            # Reading the whole content into memory is not necessarily
-            # the best when dealing with huge files. However, for
-            # html-files this is probably ok.
-            #
-            return [ns_striphtml [template::util::read_file $filename]]
-        }
-        text/plain {
-            file delete -- $tmp_filename
-            #
-            # Don't trust blindly the extension and try to use the
-            # unix "file" command to get more info.
-            #
-            set file_command [::util::which file]
-            if {$file_command ne ""} {
-                set result [exec -ignorestderr $file_command --mime-type $filename]
-                set mime_type [lindex $result 1]
+            text/html {
                 #
-                # Maybe, we are too restrictve by the following test,
-                # but let us be conservative first.
+                # Reading the whole content into memory is not necessarily
+                # the best when dealing with huge files. However, for
+                # html-files this is probably ok.
                 #
-                if {$mime_type ne "text/plain"} {
+                return [ns_striphtml [template::util::read_file $filename]]
+            }
+            text/plain {
+                #
+                # Don't blindly trust the extension; try to use the
+                # unix "file" command to get more info.
+                #
+                set file_command [::util::which file]
+                if {$file_command ne ""} {
+                    set result [exec -ignorestderr $file_command --mime-type $filename]
+                    set mime_type [lindex $result 1]
                     #
-                    # The available file is not what it preteneds to
-                    # be. We could try further to extract content, but
-                    # we give simply up here.
+                    # Maybe we are too restrictive with the following test,
+                    # but let us be conservative first.
                     #
-                    ns_log notice "search-convert: not a plain text file $result"
-                    return ""
+                    if {$mime_type ne "text/plain"} {
+                        #
+                        # The available file is not what it pretends to
+                        # be. We could try further to extract content, but
+                        # we simply give up here.
+                        #
+                        ns_log notice "search-convert: not a plain text file $result"
+                        return ""
+                    }
                 }
+                #
+                # Reading the whole content into memory is not necessarily
+                # the best when dealing with huge files. However, for
+                # txt-files this is probably ok.
+                #
+                return [template::util::read_file $filename]
             }
-            #
-            # Reading the whole content into memory is not necessarily
-            # the best when dealing with huge files. However, for
-            # txt-files this is probably ok.
-            #
-            return [template::util::read_file $filename]
-        }
 
-        default {
-            # If there's nothing implemented for a particular mime type
-            # we'll just index filename and pathname
-            return ""
+            default {
+                # If there's nothing implemented for a particular mime type
+                # we'll just index filename and pathname
+                return ""
+            }
         }
-    }
 
-    ad_try {
-        set convert_command [subst $convert_command]
-        exec -- {*}$convert_command
     } on error {errorMsg} {
-        if {$mime_type eq "application/pdf" &&
-            [string first $errorMsg "Command Line Error: Incorrect password"] >= 0} {
-            ns_log warning "SEARCH: pdf seems password protected - $convert_command"
-        } else {
-            ns_log error "SEARCH: conversion failed - $convert_command: $errorMsg"
-        }
-    } on ok {d} {
-        set fd [open $tmp_filename "r"]
-        set result [read $fd]
-        close $fd
-    } finally {
-        file delete -- $tmp_filename
+        ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg"
+        return ""
     }
 
-    return $result
 }