The class below searches a file for a given string. The search is implemented with the Boyer-Moore algorithm.
All you need to do is copy the full class below and call it as shown:
long searchCount = FileGrep.GetMatchIndexes(filePath, "Hyderabad").Count;
This program can process huge files in seconds.
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.IO;
using System.Linq;
using System.Text;
namespace FileProcessor
{
    /// <summary>
    /// Finds every occurrence of a string inside a file using the Boyer-Moore
    /// algorithm. The file is read in fixed-size chunks, so memory use is bounded
    /// by the buffer size rather than by the file size.
    /// </summary>
    public class FileGrep
    {
        #region Public Methods

        /// <summary>
        /// Returns the byte offsets (from the start of the file) of every occurrence
        /// of <paramref name="searchFor"/> in the file at <paramref name="filePath"/>.
        /// Overlapping occurrences are all reported.
        /// </summary>
        /// <param name="filePath">Path of the file to search.</param>
        /// <param name="searchFor">
        /// Text to look for. It is encoded as UTF-8 and matched byte-for-byte
        /// (case-sensitive). A null or empty value yields an empty result.
        /// </param>
        /// <param name="bufferSize">
        /// Number of file bytes consumed per chunk. Each read additionally pulls in
        /// (pattern length - 1) bytes of look-ahead so that matches straddling a
        /// chunk boundary are found exactly once.
        /// </param>
        /// <returns>A read-only list of match offsets in ascending order.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="bufferSize"/> is not positive, or the buffer plus the
        /// look-ahead would exceed <see cref="Int32.MaxValue"/> bytes.
        /// </exception>
        /// <exception cref="NotSupportedException">The opened stream cannot seek.</exception>
        public static ReadOnlyCollection<long> GetMatchIndexes(string filePath, string searchFor, int bufferSize = 1024 * 1024)
        {
            List<long> matchIndexes = new List<long>();
            if (string.IsNullOrEmpty(searchFor))
            {
                return new ReadOnlyCollection<long>(matchIndexes);
            }

            FileInfo fileToSearch = new FileInfo(filePath);
            if (!fileToSearch.Exists)
            {
                throw new FileNotFoundException("The file to search does not exist.", filePath);
            }

            byte[] searchPattern = Encoding.UTF8.GetBytes(searchFor);

            if (bufferSize <= 0)
            {
                throw new ArgumentOutOfRangeException("bufferSize", bufferSize, "Size of the file buffer must be greater than zero.");
            }
            if (bufferSize > (Int32.MaxValue - (searchPattern.Length - 1)))
            {
                throw new ArgumentOutOfRangeException("bufferSize", bufferSize, string.Format("Size of the file buffer ({0}) plus the size of the search pattern minus one ({1}) may not exceed Int32.MaxValue ({2}).", bufferSize, (searchPattern.Length - 1), Int32.MaxValue));
            }

            // The shift tables depend only on the pattern, so build them once up front.
            long[] badCharacterShift = BuildBadCharacterShift(searchPattern);
            long[] goodSuffixShift = BuildGoodSuffixShift(searchPattern, FindSuffixes(searchPattern));

            using (FileStream stream = fileToSearch.OpenRead())
            {
                if (!stream.CanSeek)
                {
                    throw new NotSupportedException(String.Format("The file '{0}' is not seekable!  Search cannot be performed.", fileToSearch));
                }

                // Tracked as a long: an int offset (bufferSize * chunkIndex) overflows
                // as soon as the search passes the 2 GB mark.
                long chunkStart = 0;
                while (true)
                {
                    byte[] fileData = GetNextChunkForSearch(stream, chunkStart, bufferSize, searchPattern.Length);
                    if (fileData == null || fileData.Length == 0)
                    {
                        // Nothing left that is long enough to contain the pattern.
                        break;
                    }

                    // Matches are relative to the chunk; translate them to file offsets.
                    foreach (long chunkRelativeIndex in GetMatchIndexes_Internal(fileData, searchPattern, goodSuffixShift, badCharacterShift))
                    {
                        matchIndexes.Add(chunkRelativeIndex + chunkStart);
                    }

                    chunkStart += bufferSize;
                }
            }

            return new ReadOnlyCollection<long>(matchIndexes);
        }

        #endregion Public Methods

        #region Helpers

        /// <summary>
        /// Builds the Boyer-Moore bad-character table: for each possible byte value,
        /// the distance from its last occurrence in the pattern (excluding the final
        /// position) to the end of the pattern, or the pattern length if the byte
        /// does not occur there.
        /// </summary>
        private static long[] BuildBadCharacterShift(byte[] pattern)
        {
            long[] badCharacterShift = new long[256];
            long patternLength = Convert.ToInt64(pattern.Length);

            for (long c = 0; c < badCharacterShift.Length; ++c)
            {
                badCharacterShift[c] = patternLength;
            }
            for (long i = 0; i < patternLength - 1; ++i)
            {
                badCharacterShift[pattern[i]] = patternLength - i - 1;
            }

            return badCharacterShift;
        }

        /// <summary>
        /// Computes, for every position i, the length of the longest substring that
        /// ends at i and is also a suffix of the whole pattern. The pattern must be
        /// non-empty.
        /// </summary>
        private static long[] FindSuffixes(byte[] pattern)
        {
            long f = 0;
            long patternLength = Convert.ToInt64(pattern.Length);
            long[] suffixes = new long[pattern.Length + 1];

            suffixes[patternLength - 1] = patternLength;
            long g = patternLength - 1;
            for (long i = patternLength - 2; i >= 0; --i)
            {
                if (i > g && suffixes[i + patternLength - 1 - f] < i - g)
                {
                    // Position i lies inside a previously matched window; reuse that result.
                    suffixes[i] = suffixes[i + patternLength - 1 - f];
                }
                else
                {
                    if (i < g)
                    {
                        g = i;
                    }
                    f = i;
                    while (g >= 0 && (pattern[g] == pattern[g + patternLength - 1 - f]))
                    {
                        --g;
                    }
                    suffixes[i] = f - g;
                }
            }

            return suffixes;
        }

        /// <summary>
        /// Builds the Boyer-Moore good-suffix table from the suffix lengths produced
        /// by <see cref="FindSuffixes"/>. Entry i is the shift to apply after a
        /// mismatch at pattern position i; entry 0 is also the shift after a full match.
        /// </summary>
        private static long[] BuildGoodSuffixShift(byte[] pattern, long[] suff)
        {
            long patternLength = Convert.ToInt64(pattern.Length);
            long[] goodSuffixShift = new long[pattern.Length + 1];

            for (long i = 0; i < patternLength; ++i)
            {
                goodSuffixShift[i] = patternLength;
            }

            // Case 1: a prefix of the pattern matches a suffix of the matched part.
            long j = 0;
            for (long i = patternLength - 1; i >= 0; --i)
            {
                if (suff[i] == i + 1)
                {
                    for (; j < patternLength - 1 - i; ++j)
                    {
                        if (goodSuffixShift[j] == patternLength)
                        {
                            goodSuffixShift[j] = patternLength - 1 - i;
                        }
                    }
                }
            }

            // Case 2: the matched suffix re-occurs elsewhere inside the pattern.
            for (long i = 0; i <= patternLength - 2; ++i)
            {
                goodSuffixShift[patternLength - 1 - suff[i]] = patternLength - 1 - i;
            }

            return goodSuffixShift;
        }

        /// <summary>
        /// Reads the chunk that starts at <paramref name="fileStartIndex"/>:
        /// <paramref name="fileSearchBufferSize"/> bytes plus (pattern length - 1)
        /// bytes of look-ahead. Returns null when the start is at or past the end of
        /// the stream, or when fewer bytes remain than the pattern needs.
        /// </summary>
        private static byte[] GetNextChunkForSearch(Stream stream, long fileStartIndex, int fileSearchBufferSize, int searchPatternLength)
        {
            if (fileStartIndex >= stream.Length)
            {
                return null;
            }

            stream.Seek(fileStartIndex, SeekOrigin.Begin);

            int searchBytesLength = fileSearchBufferSize + (searchPatternLength - 1);
            byte[] searchBytes = new byte[searchBytesLength];

            // Stream.Read may return fewer bytes than requested even before the end
            // of the stream, so keep reading until the buffer is full or EOF is hit.
            int numBytesRead = 0;
            while (numBytesRead < searchBytesLength)
            {
                int read = stream.Read(searchBytes, numBytesRead, searchBytesLength - numBytesRead);
                if (read == 0)
                {
                    break;
                }
                numBytesRead += read;
            }

            if (numBytesRead < searchPatternLength)
            {
                return null;
            }
            if (numBytesRead < searchBytesLength)
            {
                // Final, partial chunk: trim so the search never scans unread zero bytes.
                Array.Resize(ref searchBytes, numBytesRead);
            }

            return searchBytes;
        }

        /// <summary>
        /// Runs the Boyer-Moore scan over one in-memory chunk and returns the
        /// chunk-relative start index of every match.
        /// </summary>
        private static List<long> GetMatchIndexes_Internal(byte[] dataToSearch, byte[] searchPattern, long[] goodSuffixShift, long[] badCharacterShift)
        {
            List<long> matchIndexes = new List<long>();
            long patternLength = Convert.ToInt64(searchPattern.Length);
            long textLength = Convert.ToInt64(dataToSearch.Length);

            long index = 0;
            while (index <= (textLength - patternLength))
            {
                // Compare right-to-left; 'unmatched' ends at the mismatch position,
                // or at -1 when the whole pattern matched.
                long unmatched = patternLength - 1;
                while (unmatched >= 0 && searchPattern[unmatched] == dataToSearch[unmatched + index])
                {
                    unmatched--;
                }

                if (unmatched < 0)
                {
                    matchIndexes.Add(index);
                    index += goodSuffixShift[0];
                }
                else
                {
                    // Take the larger of the two heuristics; the good-suffix shift is
                    // always at least 1, so the scan always advances.
                    index += Math.Max(goodSuffixShift[unmatched], badCharacterShift[dataToSearch[unmatched + index]] - patternLength + 1 + unmatched);
                }
            }

            return matchIndexes;
        }

        #endregion Helpers
    }
}
 
 
No comments:
Post a Comment