Index: openacs-4/packages/search/tcl/search-convert-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/search/tcl/search-convert-procs.tcl,v diff -u -r1.7 -r1.8 --- openacs-4/packages/search/tcl/search-convert-procs.tcl 11 Sep 2024 06:15:53 -0000 1.7 +++ openacs-4/packages/search/tcl/search-convert-procs.tcl 20 Nov 2024 15:33:34 -0000 1.8 @@ -29,95 +29,98 @@ return "" } - set tmp_filename [ad_tmpnam] - set result "" + ad_try { - switch -glob $mime_type { - application/msword - - application/vnd.ms-word { - set convert_command {catdoc $filename >$tmp_filename} - } - application/msexcel - - application/vnd.ms-excel { - set convert_command {xls2csv $filename >$tmp_filename 2> /dev/null} - } - application/mspowerpoint - - application/vnd.ms-powerpoint { - set convert_command {catppt $filename >$tmp_filename} - } - application/pdf { - if {![util::file_content_check -type pdf -file $filename]} { - ns_log warning "search: $filename ($mime_type) is not a pdf file; skip indexing" - file delete -- $tmp_filename - return "" + switch -glob $mime_type { + application/msword - + application/vnd.ms-word { + return [exec -- catdoc $filename] } - set convert_command {pdftotext $filename $tmp_filename} - } - application/vnd.oasis.opendocument.text - - application/vnd.oasis.opendocument.text-template - - application/vnd.oasis.opendocument.text-web - - application/vnd.oasis.opendocument.text-master - - application/vnd.oasis.opendocument.presentation - - application/vnd.oasis.opendocument.presentation-template - - application/vnd.oasis.opendocument.spreadsheet - - application/vnd.oasis.opendocument.spreadsheet-template { - if {![util::file_content_check -type zip -file $filename]} { - ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing" - file delete -- $tmp_filename - return "" + application/msexcel - + application/vnd.ms-excel { + return [exec -ignorestderr -- xls2csv $filename] } - set convert_command {[util::which unzip] -p $filename content.xml >$tmp_filename} - } - application/vnd.openxmlformats-officedocument.* { - # - # File claims to be a MS Office Open XML Format - # - # Similar to ODF, these files are in fact a zip archive - # containing a directory structure that describes the - # document. The text content we are looking for is located - # in a specific path for every document type, but the - # principle is always the same: unzip the xml location - # from the archive and return it stripped of any markup. - # - - switch $mime_type { - application/vnd.openxmlformats-officedocument.presentationml.presentation { - # - # PowerPoint .pptx - # - set xml_path ppt/slides/*.xml + application/mspowerpoint - + application/vnd.ms-powerpoint { + return [exec -- catppt $filename] + } + application/pdf { + if {![util::file_content_check -type pdf -file $filename]} { + ns_log warning "search: $filename ($mime_type) is not a pdf file; skip indexing" + return "" + } else { + return [exec -- pdftotext $filename -] } - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet { + } + application/vnd.oasis.opendocument.text - + application/vnd.oasis.opendocument.text-template - + application/vnd.oasis.opendocument.text-web - + application/vnd.oasis.opendocument.text-master - + application/vnd.oasis.opendocument.presentation - + application/vnd.oasis.opendocument.presentation-template - + application/vnd.oasis.opendocument.spreadsheet - + application/vnd.oasis.opendocument.spreadsheet-template { + if {![util::file_content_check -type zip -file $filename]} { + ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing" + return "" + } else { # - # Excel .xlsx + # Extract the markup... # - set xml_path xl/sharedStrings.xml - } - application/vnd.openxmlformats-officedocument.wordprocessingml.document { + set xml [exec -- [util::which unzip] -p $filename content.xml] # - # Word .docx + # ... and clean it up so that only the plain text remains. # - set xml_path word/document.xml + return [string trim [ns_striphtml $xml]] } - default { - # - # We do not support this file, exit. - # - return "" - } } + application/vnd.openxmlformats-officedocument.* { + # + # File claims to be a MS Office Open XML Format + # + # Similar to ODF, these files are in fact a zip archive + # containing a directory structure that describes the + # document. The text content we are looking for is located + # in a specific path for every document type, but the + # principle is always the same: unzip the xml location + # from the archive and return it stripped of any markup. + # - file delete -- $tmp_filename + switch $mime_type { + application/vnd.openxmlformats-officedocument.presentationml.presentation { + # + # PowerPoint .pptx + # + set xml_path ppt/slides/*.xml + } + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet { + # + # Excel .xlsx + # + set xml_path xl/sharedStrings.xml + } + application/vnd.openxmlformats-officedocument.wordprocessingml.document { + # + # Word .docx + # + set xml_path word/document.xml + } + default { + # + # We do not support this file, exit. + # + return "" + } + } - # - # First check that we can unzip the file - # - if {![util::file_content_check -type zip -file $filename]} { - ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing" - return "" - } + # + # First check that we can unzip the file + # + if {![util::file_content_check -type zip -file $filename]} { + ns_log warning "search: $filename ($mime_type) is not a zip file; skip indexing" + return "" + } - ad_try { # # Extract the markup... # @@ -126,78 +129,58 @@ # ... and clean it up so that only the plain text remains. # return [string trim [ns_striphtml $xml]] - } on error {errorMsg} { - ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg" - return "" } - } - text/html { - file delete -- $tmp_filename - # - # Reading the whole content into memory is not necessarily - # the best when dealing with huge files. However, for - # html-files this is probably ok. - # - return [ns_striphtml [template::util::read_file $filename]] - } - text/plain { - file delete -- $tmp_filename - # - # Don't trust blindly the extension and try to use the - # unix "file" command to get more info. - # - set file_command [::util::which file] - if {$file_command ne ""} { - set result [exec -ignorestderr $file_command --mime-type $filename] - set mime_type [lindex $result 1] + text/html { # - # Maybe, we are too restrictve by the following test, - # but let us be conservative first. + # Reading the whole content into memory is not necessarily + # the best when dealing with huge files. However, for + # html-files this is probably ok. # - if {$mime_type ne "text/plain"} { + return [ns_striphtml [template::util::read_file $filename]] + } + text/plain { + # + # Don't trust blindly the extension and try to use the + # unix "file" command to get more info. + # + set file_command [::util::which file] + if {$file_command ne ""} { + set result [exec -ignorestderr $file_command --mime-type $filename] + set mime_type [lindex $result 1] # - # The available file is not what it preteneds to - # be. We could try further to extract content, but - # we give simply up here. + # Maybe, we are too restrictve by the following test, + # but let us be conservative first. # - ns_log notice "search-convert: not a plain text file $result" - return "" + if {$mime_type ne "text/plain"} { + # + # The available file is not what it preteneds to + # be. We could try further to extract content, but + # we give simply up here. + # + ns_log notice "search-convert: not a plain text file $result" + return "" + } } + # + # Reading the whole content into memory is not necessarily + # the best when dealing with huge files. However, for + # txt-files this is probably ok. + # + return [template::util::read_file $filename] } - # - # Reading the whole content into memory is not necessarily - # the best when dealing with huge files. However, for - # txt-files this is probably ok. - # - return [template::util::read_file $filename] - } - default { - # If there's nothing implemented for a particular mime type - # we'll just index filename and pathname - return "" + default { + # If there's nothing implemented for a particular mime type + # we'll just index filename and pathname + return "" + } } - } - ad_try { - set convert_command [subst $convert_command] - exec -- {*}$convert_command } on error {errorMsg} { - if {$mime_type eq "application/pdf" && - [string first $errorMsg "Command Line Error: Incorrect password"] >= 0} { - ns_log warning "SEARCH: pdf seems password protected - $convert_command" - } else { - ns_log error "SEARCH: conversion failed - $convert_command: $errorMsg" - } - } on ok {d} { - set fd [open $tmp_filename "r"] - set result [read $fd] - close $fd - } finally { - file delete -- $tmp_filename + ns_log error "SEARCH: conversion failed - cannot extract text from $filename ($mime_type): $errorMsg" + return "" } - return $result }