Thursday, 12 December 2019

C#.NET - Large Files Search Processing with Boyer-Moore Searching Algorithm

The below class searches the given string in the specified file. This algorithm is implemented with Boyer-Moore. 

All you need to copy the below full class and call it as shown below.

long searchCount = FileGrep.GetMatchIndexes(filePath, "Hyderabad").Count;

This program can process huge files in seconds.
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.IO;
using System.Linq;
using System.Text;

namespace FileProcessor
{
    public class FileGrep
    {
        #region Public Methods
        /// 
        /// Return all matches of the pattern in specified file using the Boyer-Moore algorithm
        /// 
        /// The file to be searched
        /// The size of the buffer to use when reading the file
        /// IEnumerable which returns the indexes of pattern matches
        public static ReadOnlyCollection GetMatchIndexes(string filePath, string searchFor, int bufferSize = 1024 * 1024)
        {
            List matchIndexes = new List();

            if (string.IsNullOrEmpty(searchFor))
            {
                return new ReadOnlyCollection(matchIndexes);
            }

            FileInfo fileToSearch = new FileInfo(filePath);
            if (!fileToSearch.Exists)
            {
                throw new FileNotFoundException();
            }

            if (bufferSize  (Int32.MaxValue - (searchPattern.Length - 1)))
            {
                throw new ArgumentOutOfRangeException("bufferSize", bufferSize, string.Format("Size of the file buffer ({0}) plus the size of the search pattern minus one ({1}) may not exceed Int32.MaxValue ({2}).", bufferSize, (searchPattern.Length - 1), Int32.MaxValue));
            }

            using (FileStream stream = fileToSearch.OpenRead())
            {
                // Make sure that the file stream is seekable to access the data
                if (!stream.CanSeek)
                {
                    throw new Exception(String.Format("The file '{0}' is not seekable!  Search cannot be performed.", fileToSearch));
                }

                int chunkIndex = 0;
                while (true)
                {
                    byte[] fileData = GetNextChunkForSearch(stream, chunkIndex, bufferSize, searchPattern.Length);

                    if (fileData == null || !fileData.Any())
                    {
                        //EOF, so exit
                        break;
                    }

                    List occuranceIndexes = GetMatchIndexes_Internal(fileData, searchPattern, goodSuffixShift, badCharacterShift);

                    if (occuranceIndexes != null)
                    {
                        // We found one or more matches in our buffer.  Translate the buffer index
                        // back to the file index by adding the buffer size * the chunk index.
                        int bufferOffset = (bufferSize * chunkIndex);
                        matchIndexes.AddRange(occuranceIndexes.Select(bufferMatchIndex => (bufferMatchIndex + bufferOffset)));
                    }
                    chunkIndex++;
                } // end while
            }
            return new ReadOnlyCollection(matchIndexes);
        }

        #endregion Public Methods

        #region Helpers
        /// 
        /// Build the bad byte shift array.
        /// 
        /// Pattern for search
        /// Bad byte shift array
        private static long[] BuildBadCharacterShift(byte[] pattern)
        {
            long[] badCharacterShift = new long[256];
            long patternLength = Convert.ToInt64(pattern.Length);

            for (long c = 0; c 
        /// Find suffixes in the pattern
        /// 
        /// Pattern for search
        /// Suffix array
        private static long[] FindSuffixes(byte[] pattern)
        {
            long f = 0;

            long patternLength = Convert.ToInt64(pattern.Length);
            long[] suffixes = new long[pattern.Length + 1];

            suffixes[patternLength - 1] = patternLength;
            long g = patternLength - 1;
            for (long i = patternLength - 2; i >= 0; --i)
            {
                if (i > g && suffixes[i + patternLength - 1 - f] = 0 && (pattern[g] == pattern[g + patternLength - 1 - f]))
                    {
                        --g;
                    }
                    suffixes[i] = f - g;
                }
            }

            return suffixes;
        }
        /// 
        /// Build the good suffix array.
        /// 
        /// Pattern for search
        /// Suffixes in the pattern
        /// Good suffix shift array
        private static long[] BuildGoodSuffixShift(byte[] pattern, long[] suff)
        {
            long patternLength = Convert.ToInt64(pattern.Length);
            long[] goodSuffixShift = new long[pattern.Length + 1];

            for (long i = 0; i = -1; --i)
            {
                if (i == -1 || suff[i] == i + 1)
                {
                    for (; j 
        /// Gets the next chunk of bytes from the stream to perform the search.
        /// 
        /// The stream containing the file to search
        /// The index of the chunk to read from the file
        /// The size of the data chunk to read from the file
        /// The bytes read out of the stream
        private static byte[] GetNextChunkForSearch(Stream stream, int chunkIndex, int fileSearchBufferSize, int searchPatternLength)
        {
            byte[] chunk = null;

            long fileStartIndex = Convert.ToInt64(chunkIndex) * Convert.ToInt64(fileSearchBufferSize);
            if (fileStartIndex = searchPatternLength)
                {
                    // If we read less than the buffer length (end of file), then return a trimmed byte array.
                    if (numBytesRead 
        /// Return all matches of the pattern in specified data using the Boyer-Moore algorithm
        /// 
        /// The data to be searched
        /// List which returns the indexes of pattern matches
        private static List GetMatchIndexes_Internal(byte[] dataToSearch, byte[] searchPattern, long[] goodSuffixShift, long[] badCharacterShift)
        {
            List matchIndexes = new List();

            long patternLength = Convert.ToInt64(searchPattern.Length);
            long textLength = Convert.ToInt64(dataToSearch.Length);

            /* Searching */
            long index = 0;
            while (index = 0)
                {
                    if (searchPattern[unmatched] != dataToSearch[unmatched + index])
                    {
                        //pattern mismatch
                        index += Math.Max(goodSuffixShift[unmatched], badCharacterShift[dataToSearch[unmatched + index]] - patternLength + 1 + unmatched);
                        break;
                    }
                    unmatched--;
                }

                if (unmatched < 0)
                {
                    //match found
                    matchIndexes.Add(index);
                    index += goodSuffixShift[0];
                }
            }

            return matchIndexes;
        }
        #endregion Helpers
    }

}

No comments: