Index: openacs-4/packages/search/tcl/search-convert-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/search/tcl/search-convert-procs.tcl,v
diff -u -r1.5.2.3 -r1.5.2.4
--- openacs-4/packages/search/tcl/search-convert-procs.tcl 1 Mar 2022 09:18:44 -0000 1.5.2.3
+++ openacs-4/packages/search/tcl/search-convert-procs.tcl 13 Jun 2022 13:45:51 -0000 1.5.2.4
@@ -68,6 +68,40 @@
             }
             set convert_command {unzip -p $filename content.xml >$tmp_filename}
         }
+        application/vnd.openxmlformats-officedocument.presentationml.presentation {
+            #
+            # File claims to be a MS pptx (zip archive, slides in ppt/slides/*.xml)
+            #
+
+            file delete -- $tmp_filename
+
+            #
+            # First check that we can unzip the file
+            #
+            if {![util::file_content_check -type zip -file $filename]} {
+                ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing"
+                return ""
+            }
+
+            ad_try {
+                #
+                # Extract the markup of all slides (unzip expands the wildcard itself)...
+                #
+                set xml [exec -- unzip -p $filename ppt/slides/*.xml]
+                #
+                # ... keep only the <a:t> text runs; XML entities are decoded on return.
+                #
+                set txt ""
+                foreach {m t} [regexp -all -inline {<a:t>([^<]+)</a:t>} $xml] {
+                    lappend txt $t
+                }
+            } on error {errorMsg} {
+                ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg"
+                return ""
+            } on ok {d} {
+                return [string map {&lt; < &gt; > &quot; \" &apos; ' &amp; &} [join $txt]]
+            }
+        }
         text/html {
             file delete -- $tmp_filename
             #