0

入力した条件に基づいて共有ドライブを検索する検索ボックスをアプリケーションに追加しようとしています。私が現在持っているコードは次のとおりです。

' Scans every file below a fixed root folder and lists, in ListBox1, the full
' path of each file whose text (as returned by GetFileText) contains the
' search term. The comparison is the default, case-sensitive String.Contains.
Public Sub searchProcedure()

    Const rootPath As String = "C:\Documents and Settings\Practice Search"
    Const needle As String = "test string"

    Dim root As New System.IO.DirectoryInfo(rootPath)

    ' No extension filter is applied, so every file in the tree is examined.
    For Each candidate As System.IO.FileInfo In root.GetFiles("*.*", System.IO.SearchOption.AllDirectories)

        Dim fullPath As String = candidate.FullName

        If GetFileText(fullPath).Contains(needle) Then
            ListBox1.Items.Add(fullPath)
        End If

    Next

End Sub


' Returns the complete text of the file at Name, or an empty string when no
' such file exists. Name is not modified.
Function GetFileText(ByRef Name As String) As String

    ' A missing file yields an empty result instead of an exception.
    If Not System.IO.File.Exists(Name) Then
        Return String.Empty
    End If

    Return System.IO.File.ReadAllText(Name)

End Function

私が抱えている問題は、Microsoft Office ドキュメントに関するものです。内容は filecontents 文字列に読み込まれますが、内容は XML (?) です。

実際のテキスト コンテンツを検索用の文字列に渡す方法についてのアイデアはありますか?

ありがとう!

4

回答 4 件

0

「すぐに使える」解決策はないという結論に達しました。そのため、ドキュメントの種類ごとに個別に対応しています。OpenXML SDK を使用して Word 文書からテキストを抽出するコードは次のとおりです。

Imports System.Xml.XmlReader
Imports System.IO
Imports DocumentFormat.OpenXml.Packaging
Imports DocumentFormat.OpenXml.Wordprocessing
Imports DocumentFormat.OpenXml.Spreadsheet
Imports DocumentFormat.OpenXml
Imports System.Linq

' Demo: opens a fixed .docx file, extracts its body text through
' OpenAndAddtoWordProcessingStream and shows the text in a message box.
Public Sub WordProcessing()

    Dim strDoc As String = "C:\Documents and Settings\Practice.docx"

    ' Initialised so the ByRef argument never starts out unassigned.
    Dim txt As String = String.Empty

    ' Using releases the file handle even when extraction throws.
    Using stream As Stream = File.Open(strDoc, FileMode.Open)
        OpenAndAddtoWordProcessingStream(stream, txt)
    End Using

    MessageBox.Show(txt)

End Sub

' Reads the plain text of a Word (Open XML) document from an open stream.
'   stream - readable, seekable stream positioned on a .docx/.docm package.
'   txt    - receives the concatenated body text, or an empty string when the
'            package has no main document part or body.
' The stream itself is left for the caller to close.
Public Sub OpenAndAddtoWordProcessingStream(ByVal stream As Stream, ByRef txt As String)

    ' Opened read-only (isEditable = False): extraction never needs to write,
    ' and an editable package may be saved back to the stream when it closes.
    ' Using disposes the package even if reading the body throws.
    Using wordDoc As WordprocessingDocument = WordprocessingDocument.Open(stream, False)

        Dim mainPart As MainDocumentPart = wordDoc.MainDocumentPart

        If mainPart Is Nothing OrElse mainPart.Document Is Nothing OrElse mainPart.Document.Body Is Nothing Then
            txt = String.Empty
            Return
        End If

        txt = mainPart.Document.Body.InnerText

    End Using

End Sub

Excelから抽出するコードは次のとおりです。

  ' Demo: shows every shared string of a fixed .xlsx workbook in a message box.
  Dim strDoc As String = "C:\Documents and Settings\Practice.xlsx"

  ' Read-only open; Using releases the package (and its file handle) even if
  ' enumeration throws.
  Using spreadsheetDoc As SpreadsheetDocument = SpreadsheetDocument.Open(strDoc, False)

      Dim shareStringPart As SharedStringTablePart = spreadsheetDoc.WorkbookPart.SharedStringTablePart

      ' The shared-string part is optional in a package, so it may be absent.
      If shareStringPart IsNot Nothing Then

          For Each Item As SharedStringItem In shareStringPart.SharedStringTable.Elements(Of SharedStringItem)()

              MessageBox.Show(Item.InnerText)

          Next

      End If

  End Using

次に、.PDF、Access、Powerpointについて調べます。

2013-03-22T12:15:34.177 に回答
0

コンテンツが XML または HTML の場合は、正規表現を使用してタグを完全に取り除くことができます。

' Removes everything that looks like a tag. RegexOptions.Singleline lets "."
' match line breaks, so tags wrapped across several lines are removed as well.
Regex.Replace(text, "<.*?>", "", RegexOptions.Singleline)

このような:

' Reads the file at Name and strips tag-like markup from its text.
' Yields an empty string when the file does not exist.
Dim fileContents = String.Empty

If System.IO.File.Exists(Name) Then

    fileContents = System.IO.File.ReadAllText(Name)
    ' Singleline lets "." match line breaks so tags that wrap over several
    ' lines are removed too; the lazy quantifier stops at the first ">".
    fileContents = Regex.Replace(fileContents, "<.*?>", "", RegexOptions.Singleline)
End If

Return fileContents
2013-03-19T23:52:17.467 に回答
0

SSSの指示に従って質問が完全に回答されるように、これを追加しています。Office ドキュメント、office docs(x)、pdf、およびその他の一般的なファイル形式のテキスト文字列を検索するための完全なコードを次に示します。

Imports System.IO
Imports System.Xml.XmlReader
Imports DocumentFormat.OpenXml.Packaging
Imports DocumentFormat.OpenXml.Wordprocessing
Imports DocumentFormat.OpenXml.Spreadsheet
Imports DocumentFormat.OpenXml
Imports System.Linq
Imports System
Imports System.Collections.Generic
Imports A = DocumentFormat.OpenXml.Drawing
Imports DocumentFormat.OpenXml.Presentation
Imports System.Text
Imports iTextSharp.text
Imports iTextSharp.text.pdf

Module searchFiles

' Raw text of the file currently being examined. ListFiles assigns it just
' before calling fileExtention, whose Case Else (catch-all) branch searches it.
Public readAllText As String

' Entry point of a search: empties the results list on MainForm and then
' walks the \\Max1\dept\ share, listing every file that matches searchText.
Public Sub startSearch(ByVal searchText As String)

    Dim resultsView As ListView = MainForm.marketIntelligencelboxsearch

    ' Start from a clean list so hits from an earlier search do not linger.
    resultsView.Items.Clear()

    ListFiles(resultsView, New DirectoryInfo("\\Max1\dept\"), searchText)

End Sub


' Recursively lists, in lst, every file under dir_info that matches target.
'   lst      - list view that receives one item (file name + icon index as
'              returned by MainForm.sourceFileImageIndex) per matching file.
'   dir_info - folder to scan; its subfolders are scanned recursively.
'   target   - text to look for; the literal "ALL" matches every file.
' A file matches when its name contains target (ignoring case) or, failing
' that, when fileExtention reports a hit inside the file's contents.
' Files and folders that cannot be read are skipped so that one locked or
' protected entry does not abort the whole search.
Private Sub ListFiles(ByVal lst As ListView, ByVal dir_info As DirectoryInfo, ByVal target As String)

    Dim fs_infos() As FileInfo
    Dim subdirs() As DirectoryInfo

    Try
        fs_infos = dir_info.GetFiles("*.*")
        subdirs = dir_info.GetDirectories()
    Catch ex As UnauthorizedAccessException
        ' No permission for this folder: skip it and carry on with the rest.
        System.Diagnostics.Debug.WriteLine("Skipping folder " & dir_info.FullName & ": " & ex.Message)
        Return
    End Try

    ' Get the files in this directory.
    For Each fs_info As FileInfo In fs_infos
        Try
            ' OrElse short-circuits; Name (not ToString) is the bare file name.
            Dim isMatch As Boolean = (target = "ALL") OrElse fs_info.Name.IndexOf(target, StringComparison.OrdinalIgnoreCase) >= 0

            If Not isMatch Then
                ' fileExtention's catch-all branch reads this module-level text.
                readAllText = File.ReadAllText(fs_info.FullName)
                isMatch = fileExtention(fs_info.FullName, target) <> 0
            End If

            If isMatch Then
                ' Add to the list view that was passed in rather than to a hard-wired control.
                lst.Items.Add(Path.GetFileName(fs_info.FullName), MainForm.sourceFileImageIndex(fs_info.FullName))
            End If

        Catch ex As IOException
            ' Typically a file that another user currently has locked.
            System.Diagnostics.Debug.WriteLine("Skipping file " & fs_info.FullName & ": " & ex.Message)
        Catch ex As UnauthorizedAccessException
            System.Diagnostics.Debug.WriteLine("Skipping file " & fs_info.FullName & ": " & ex.Message)
        End Try
    Next fs_info

    ' Search subdirectories.
    For Each subdir As DirectoryInfo In subdirs
        ListFiles(lst, subdir, target)
    Next subdir
End Sub


' Chooses the content searcher that fits the file's extension and returns its
' result.
'   sourcePath - full path of the file to search.
'   target     - text to look for.
' Returns the value of the selected *Processing function (non-zero on a hit).
' The extension is taken from the end of the path and compared without regard
' to case, so ".DOCX" and ".docm" reach the Word branch and a folder name that
' merely contains ".pdf" no longer misroutes a file.
Public Function fileExtention(ByVal sourcePath As String, ByVal target As String) As Integer

    Dim extension As String = String.Empty

    If Not String.IsNullOrEmpty(sourcePath) Then
        extension = Path.GetExtension(sourcePath).ToLowerInvariant()
    End If

    Select Case extension

        Case ".docx", ".docm"
            Return WordProcessing(sourcePath, target)

        Case ".xlsx", ".xlsm"
            Return ExcelProcessing(sourcePath, target)

        Case ".pptx", ".pptm"
            'will read slide text and notes
            Return PowerpointProcessing(sourcePath, target)

        Case ".pdf"
            'will search text in pdf
            Return pdfProcesssing(sourcePath, target)

        Case Else
            'looks at office docs before 2007 and all other generic  extensions, includes Access 2007 and lower
            ' Searches the raw text held in the module-level readAllText field.
            Return catchallProcessing(readAllText, target)

    End Select

End Function

#Region "Search Index"

' Case-insensitive "contains" test.
'   strDoc - text to search; Nothing is treated as "no match".
'   target - text to look for.
' Returns 1 when target occurs in strDoc (ordinal, ignoring case), else 0.
Public Function catchallProcessing(ByVal strDoc As String, ByVal target As String) As Integer

    ' Nothing to search in, so nothing can be found.
    If strDoc Is Nothing Then
        Return 0
    End If

    Dim position As Integer = strDoc.IndexOf(target, StringComparison.OrdinalIgnoreCase)

    Return If(position >= 0, 1, 0)

End Function

#End Region

#Region "Word 2007 Processing"

' Searches the body text of a Word 2007+ (.docx/.docm) document.
'   strDoc - path of the document.
'   target - text to look for.
' Returns 1 when the body text contains target (ignoring case), otherwise 0;
' a package without a main document part or body counts as "no match".
Public Function WordProcessing(ByVal strDoc As String, ByVal target As String) As Integer  ' Word 2007 and Higher

    Dim txt As String = Nothing

    ' Read-only access with permissive sharing: searching never writes, and it
    ' must not lock other users of the share out of the document.
    ' The nested Using blocks release the package and the file handle on every
    ' path, which the original never did because it returned before closing.
    Using stream As Stream = File.Open(strDoc, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
        Using wordDoc As WordprocessingDocument = WordprocessingDocument.Open(stream, False)

            Dim mainPart As MainDocumentPart = wordDoc.MainDocumentPart

            If mainPart IsNot Nothing AndAlso mainPart.Document IsNot Nothing AndAlso mainPart.Document.Body IsNot Nothing Then
                txt = mainPart.Document.Body.InnerText
            End If

        End Using
    End Using

    Return catchallProcessing(txt, target) 'should return 0 or 1

End Function

#End Region

#Region "Excel 2007 Processing"

' Searches the shared strings of an Excel 2007+ (.xlsx/.xlsm) workbook.
'   strDoc - path of the workbook.
'   target - text to look for.
' Returns 1 when any shared string contains target (ignoring case), else 0.
' Only the shared-string table is examined.
Public Function ExcelProcessing(ByVal strDoc As String, ByVal target As String) As Integer 'Excel 2007 and Higher

    Dim cellText As New StringBuilder()

    ' Read-only open; Using releases the package and its file handle.
    Using spreadsheetDoc As SpreadsheetDocument = SpreadsheetDocument.Open(strDoc, False)

        Dim bookPart As WorkbookPart = spreadsheetDoc.WorkbookPart
        Dim shareStringPart As SharedStringTablePart = Nothing

        If bookPart IsNot Nothing Then
            shareStringPart = bookPart.SharedStringTablePart
        End If

        ' The shared-string part is optional, so guard against its absence.
        If shareStringPart IsNot Nothing AndAlso shareStringPart.SharedStringTable IsNot Nothing Then

            For Each Item As SharedStringItem In shareStringPart.SharedStringTable.Elements(Of SharedStringItem)()

                ' One string per line, so the end of one string and the start
                ' of the next cannot combine into a false match.
                cellText.AppendLine(Item.InnerText)

            Next

        End If

    End Using

    Return catchallProcessing(cellText.ToString(), target)

End Function

#End Region

#Region "Powerpoint 2007 Processing"

' Searches the slide and notes text of a PowerPoint presentation.
'   file   - path of the presentation.
'   target - text to look for.
' Gathers the text of every slide through GetSlideIdandText and returns the
' result of catchallProcessing on the combined text.
Public Function PowerpointProcessing(ByVal file As String, ByVal target As String) As Integer

    Dim slideTotal As Integer = CountSlides(file)

    Dim collected As String = Nothing
    Dim current As String = Nothing
    Dim slideIndex As Integer = 0

    ' Slides are addressed by zero-based position.
    Do While slideIndex < slideTotal
        GetSlideIdandText(current, file, slideIndex)
        collected &= current
        slideIndex += 1
    Loop

    Return catchallProcessing(collected, target)

End Function

' Returns the number of slides in the presentation stored at presentationFile.
' The file is opened read-only and closed again before returning.
Public Function CountSlides(ByVal presentationFile As String) As Integer

    Using deck As PresentationDocument = PresentationDocument.Open(presentationFile, False)

        ' Delegate the counting to the overload that takes an open document.
        Dim total As Integer = CountSlides(deck)
        Return total

    End Using

End Function

' Returns the number of slide parts in an already opened presentation.
'   powerpointDocument - open presentation; must not be Nothing.
' Returns 0 when the document has no presentation part.
' Throws ArgumentNullException when powerpointDocument is Nothing.
Public Function CountSlides(ByVal powerpointDocument As PresentationDocument) As Integer

    If powerpointDocument Is Nothing Then

        ' The reported name must be the actual parameter name.
        Throw New ArgumentNullException("powerpointDocument")

    End If

    Dim presentationPart As PresentationPart = powerpointDocument.PresentationPart

    If presentationPart Is Nothing Then
        Return 0
    End If

    Return presentationPart.SlideParts.Count()

End Function

' Extracts the text of one slide together with the text of its notes page.
'   sldText - receives the slide text followed by the notes text.
'   docName - path of the presentation; opened read-only for this call.
'   index   - zero-based position of the slide in the slide-id list.
' A slide without a notes page contributes only its own text.
' Always returns Nothing; the result is delivered through sldText.
Public Function GetSlideIdandText(ByRef sldText As String, ByVal docName As String, ByVal index As Integer)

    Using ppt As PresentationDocument = PresentationDocument.Open(docName, False)

        Dim part As PresentationPart = ppt.PresentationPart
        Dim slideIDs As OpenXmlElementList = part.Presentation.SlideIdList.ChildElements

        ' DirectCast fails with a clear InvalidCastException on an unexpected
        ' child element instead of a NullReferenceException one member later.
        Dim relID As String = DirectCast(slideIDs(index), SlideId).RelationshipId

        Dim slide As SlidePart = DirectCast(part.GetPartById(relID), SlidePart)

        Dim combined As New StringBuilder()

        ' Slide text first ...
        For Each textNode As A.Text In slide.Slide.Descendants(Of A.Text)()
            combined.Append(textNode.Text)
        Next

        ' ... then the notes, which are absent on slides without a notes page.
        Dim notesPart As NotesSlidePart = slide.NotesSlidePart

        If notesPart IsNot Nothing AndAlso notesPart.NotesSlide IsNot Nothing Then

            For Each textNode As A.Text In notesPart.NotesSlide.Descendants(Of A.Text)()
                combined.Append(textNode.Text)
            Next

        End If

        sldText = combined.ToString() 'concatenates the notes and slide text for searching

    End Using

    Return Nothing

End Function

#End Region

#Region "PDF Processing"

' Searches the extractable text of a PDF file.
'   strDoc - path of the PDF.
'   target - text to look for.
' Returns 1 when the text of any page contains target (ignoring case),
' otherwise 0. A path that does not exist is searched as empty text.
Public Function pdfProcesssing(ByVal strDoc As String, ByVal target As String) As Integer

    Dim stringOut As New StringBuilder()

    ' Check before constructing the reader: the constructor itself opens the file.
    If File.Exists(strDoc) Then

        Dim oReader As New iTextSharp.text.pdf.PdfReader(strDoc)

        Try

            For i As Integer = 1 To oReader.NumberOfPages

                Dim itsText As New iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy

                ' One page per line, so the end of one page and the start of
                ' the next cannot combine into a false match.
                stringOut.AppendLine(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(oReader, i, itsText))

            Next

        Finally
            ' Release the file handle even when extraction fails.
            oReader.Close()
        End Try

    End If

    Return catchallProcessing(stringOut.ToString(), target)

End Function

#End Region

End Module
2013-03-29T18:43:42.987 に回答