Index: openacs-4/packages/search/tcl/search-convert-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/search/tcl/search-convert-procs.tcl,v
diff -u -N -r1.5.2.6 -r1.5.2.7
--- openacs-4/packages/search/tcl/search-convert-procs.tcl	13 Dec 2022 14:53:33 -0000	1.5.2.6
+++ openacs-4/packages/search/tcl/search-convert-procs.tcl	15 Mar 2023 16:59:59 -0000	1.5.2.7
@@ -32,7 +32,7 @@
     set tmp_filename [ad_tmpnam]
     set result ""
 
-    switch $mime_type {
+    switch -glob $mime_type {
         application/msword -
         application/vnd.ms-word {
             set convert_command {catdoc $filename >$tmp_filename}
@@ -68,11 +68,45 @@
             }
             set convert_command {[util::which unzip] -p $filename content.xml >$tmp_filename}
         }
-        application/vnd.openxmlformats-officedocument.presentationml.presentation {
+        application/vnd.openxmlformats-officedocument.* {
             #
-            # File claims to be a MS pptx
+            # File claims to be in an MS Office Open XML format
             #
+            # Similar to ODF, these files are in fact zip archives
+            # containing a directory structure that describes the
+            # document. The text content we are looking for is located
+            # at a specific path for every document type, but the
+            # principle is always the same: unzip the XML at that path
+            # from the archive and return it stripped of any markup.
+            #
 
+            switch $mime_type {
+                application/vnd.openxmlformats-officedocument.presentationml.presentation {
+                    #
+                    # PowerPoint .pptx
+                    #
+                    set xml_path ppt/slides/*.xml
+                }
+                application/vnd.openxmlformats-officedocument.spreadsheetml.sheet {
+                    #
+                    # Excel .xlsx
+                    #
+                    set xml_path xl/sharedStrings.xml
+                }
+                application/vnd.openxmlformats-officedocument.wordprocessingml.document {
+                    #
+                    # Word .docx
+                    #
+                    set xml_path word/document.xml
+                }
+                default {
+                    #
+                    # Unsupported Office Open XML subtype: return no text.
+                    #
+                    return ""
+                }
+            }
+
             file delete -- $tmp_filename
 
             #
@@ -87,19 +121,14 @@
                 #
                 # Now we extract the markup from all slides...
                 #
-                set xml [exec -- [util::which unzip] -p $filename ppt/slides/*.xml]
+                set xml [exec -- [util::which unzip] -p $filename $xml_path]
                 #
                 # ... and clean it up so that only the plain text remains.
                 #
-                set txt ""
-                foreach {m t} [regexp -all -inline {<a:t>([^>]+)</a:t>} $xml] {
-                    lappend txt $t
-                }
+                return [string trim [ns_striphtml $xml]]
             } on error {errorMsg} {
                 ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg"
                 return ""
-            } on ok {d} {
-                return [join $txt]
             }
         }
         text/html {