2017-08-14 36 views
0

我使用下面的 PHP 代碼解析 docx 文件,以提取其中的文本和圖像:

# Extracts the body XML and the embedded images from the DOCX (zip) archive at $filename.
# Produces:
#   $imageAssets     - embedded images keyed by file name, payload base64-encoded
#   $content         - raw XML of word/document.xml with paragraph ends turned into newlines
#   $striped_content - $content with all XML tags stripped
# Returns false from the enclosing scope when the archive cannot be opened.
$zip = zip_open($filename);
    if (!$zip || is_numeric($zip)) return false;

    # Initialise the accumulators only when the surrounding code has not already done so,
    # so that the `.=` and array writes below never touch an undefined variable.
    if (!isset($content)) $content = '';
    if (!isset($imageAssets)) $imageAssets = array();

    while ($zip_entry = zip_read($zip)) {

        if (zip_entry_open($zip, $zip_entry) == FALSE) continue;

        $zipEntryName = zip_entry_name($zip_entry);

        if (strpos($zipEntryName, 'word/media/') === 0) {
            # Only entries that START with 'word/media/' (11 characters) are embedded media;
            # the length check skips the bare directory entry, which has no file name.
            if (strlen($zipEntryName) > 11) {
                # Removes 'word/media/' prefix
                $imageName = substr($zipEntryName, 11);

                # Prevent EMF file extensions passing, as they are used by word rather than being manually placed
                if (strtolower(substr($imageName, -3)) !== 'emf') {
                    # Place the image assets into an array for future reference
                    $imageAssets[$imageName] = array(
                        'h' => 'auto',
                        'w' => 'auto',
                        'title' => $imageName,
                        'id' => null,
                        'data' => base64_encode(zip_entry_read($zip_entry, zip_entry_filesize($zip_entry))));
                }
            }
        } elseif ($zipEntryName == "word/document.xml") {
            $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
        }

        # Every successfully opened entry is closed here; no branch above skips this line.
        zip_entry_close($zip_entry);
    }
    zip_close($zip);

    # A paragraph end followed directly by a new table cell becomes a space, every other
    # paragraph end becomes a line break, then all CRLF pairs are normalised to LF.
    $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
    $content = str_replace('</w:r></w:p>', "\r\n", $content);
    $content = str_replace("\r\n", "\n", $content);
    $striped_content = strip_tags($content);

我把圖像文件存儲在 $imageAssets 陣列中。剝離標籤後的內容包含全部文本,但圖像變成了一串隨機數字。如何將這些數字映射到正確的圖像?

+1

[Stack Overflow 用戶應該付出多少研究努力?](https://meta.stackoverflow.com/a/261593/5827005) – GrumpyCrouton

+0

看看這個解決方案: https://stackoverflow.com/questions/19503653/how-to-extract-text-from-word-file-doc-docx-xlsx-pptx-php –

回答

0

**試試這個代碼**

// Extracts the body text and embedded images from the DOCX (zip) archive at $filename
// and echoes one <img> tag per image entry, pointing at display.php.
// Produces:
//   $imageAssets     - images under word/media/, keyed by file name, payload base64-encoded
//   $content         - raw XML of word/document.xml with paragraph ends turned into newlines
//   $striped_content - $content with all XML tags stripped
// Returns false from the enclosing scope when the archive cannot be opened.
$zip2 = new ZipArchive;
if ($zip2->open($filename) !== true) return false;

// Initialise the accumulators only when the surrounding code has not already done so.
if (!isset($content)) $content = '';
if (!isset($imageAssets)) $imageAssets = array();

// Iterate by archive index so the index passed to display.php is always the real
// position of the entry inside the archive.
for ($i = 0; $i < $zip2->numFiles; $i++) {

    $zipEntryName = $zip2->getNameIndex($i);
    if ($zipEntryName === false) continue;

    if (preg_match("([^\s]+(\.(?i)(jpg|jpeg|png|gif|bmp))$)", $zipEntryName))
    {
        // URL-encode the file name for the query string, then HTML-escape the whole
        // attribute value so quotes or ampersands in $filename cannot break the markup.
        $imageUrl = 'display.php?filename=' . rawurlencode($filename) . '&index=' . $i;
        echo "<img src='" . htmlspecialchars($imageUrl, ENT_QUOTES) . "' ><br />";
    }

    if (strpos($zipEntryName, 'word/media/') === 0)
    {
        // The bare directory entry has no file name after the 11-character prefix.
        if (strlen($zipEntryName) <= 11) continue;

        # Removes 'word/media/' prefix
        $imageName = substr($zipEntryName, 11);

        # Prevent EMF file extensions passing, as they are used by word rather than being manually placed
        if (strtolower(substr($imageName, -3)) === 'emf') continue;

        $imageData = $zip2->getFromIndex($i);
        if ($imageData === false) continue;

        # Place the image assets into an array for future reference
        $imageAssets[$imageName] = array(
            'h' => 'auto',
            'w' => 'auto',
            'title' => $imageName,
            'id' => null,
            'data' => base64_encode($imageData));
        continue;
    }

    if ($zipEntryName != "word/document.xml") continue;

    $documentXml = $zip2->getFromIndex($i);
    if ($documentXml !== false) $content .= $documentXml;
}

$zip2->close();

// A paragraph end followed directly by a new table cell becomes a space, every other
// paragraph end becomes a line break, then all CRLF pairs are normalised to LF.
$content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
$content = str_replace('</w:r></w:p>', "\r\n", $content);
$content = str_replace("\r\n", "\n", $content);
$striped_content = strip_tags($content);

然後在同一個文件夾中添加一個用於顯示圖像的新文件(display.php):

<?php

    /*
     * Streams a single image entry out of a zip/DOCX archive.
     *
     * Query parameters:
     *   filename - path of the archive to open
     *   index    - zero-based index of the entry inside the archive
     *
     * Responds with the image bytes and a Content-Type matching the entry's
     * extension, or with HTTP 404 when the request cannot be served.
     *
     * NOTE(review): "filename" comes straight from the request. This script
     * only limits WHAT can be read (image entries); it does not limit WHICH
     * archive is opened. Restrict it to a known upload directory before
     * exposing this publicly.
     */

    /*Only these entry extensions are served, each with its own MIME type*/
    $mimeTypes = array(
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png'  => 'image/png',
        'gif'  => 'image/gif',
        'bmp'  => 'image/bmp',
    );

    /*Validate the request parameters before touching the filesystem*/
    $archivePath = (isset($_GET['filename']) && is_string($_GET['filename'])) ? $_GET['filename'] : '';
    $entryIndex = isset($_GET['index'])
        ? filter_var($_GET['index'], FILTER_VALIDATE_INT, array('options' => array('min_range' => 0)))
        : false;

    $imageData = false;
    $mimeType = null;

    /*Create a new ZIP archive object*/
    $zip = new ZipArchive;

    /*Open the received archive file*/
    if ($archivePath !== '' && $entryIndex !== false && true === $zip->open($archivePath)) {

        /*Serve the entry only when its name carries a known image extension*/
        $entryName = $zip->getNameIndex($entryIndex);
        $extension = is_string($entryName) ? strtolower(pathinfo($entryName, PATHINFO_EXTENSION)) : '';

        if (isset($mimeTypes[$extension])) {
            /*Get the content of the specified index of ZIP archive*/
            $imageData = $zip->getFromIndex($entryIndex);
            $mimeType = $mimeTypes[$extension];
        }

        /*Close only an archive that was actually opened*/
        $zip->close();
    }

    if ($imageData === false) {
        http_response_code(404);
    } else {
        /*Tell the browser which kind of image is being sent*/
        header('Content-Type: ' . $mimeType);
        echo $imageData;
    }
    ?>