The class below searches a specified file for a given string. The search is implemented with the Boyer-Moore string-matching algorithm.
All you need to do is copy the full class below and call it as shown here:
long searchCount = FileGrep.GetMatchIndexes(filePath, "Hyderabad").Count;
This program can process huge files in seconds.
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.IO;
using System.Linq;
using System.Text;
namespace FileProcessor
{
/// <summary>
/// Locates every occurrence of a string inside a file using the Boyer-Moore
/// string-matching algorithm. The file is processed in fixed-size chunks, so
/// files far larger than available memory can be searched.
/// </summary>
public class FileGrep
{
    #region Public Methods

    /// <summary>
    /// Returns the zero-based byte offsets of every occurrence of
    /// <paramref name="searchFor"/> in the file at <paramref name="filePath"/>.
    /// Overlapping occurrences are all reported, in ascending order.
    /// </summary>
    /// <param name="filePath">Path of the file to search.</param>
    /// <param name="searchFor">
    /// Text to look for. It is converted to bytes with UTF-8 and compared
    /// byte-for-byte (case-sensitive). A null or empty value yields an empty result.
    /// </param>
    /// <param name="bufferSize">
    /// Number of file bytes covered by each chunk. Each read additionally pulls in
    /// (pattern length - 1) bytes of overlap so matches spanning a chunk boundary
    /// are not missed.
    /// </param>
    /// <returns>A read-only collection of byte offsets (relative to the start of the file).</returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bufferSize"/> is not positive, or is so large that the
    /// chunk buffer (buffer size plus overlap) would exceed <see cref="Int32.MaxValue"/>.
    /// </exception>
    /// <exception cref="NotSupportedException">The opened file stream is not seekable.</exception>
    public static ReadOnlyCollection<long> GetMatchIndexes(string filePath, string searchFor, int bufferSize = 1024 * 1024)
    {
        List<long> matchIndexes = new List<long>();
        if (string.IsNullOrEmpty(searchFor))
        {
            return new ReadOnlyCollection<long>(matchIndexes);
        }

        FileInfo fileToSearch = new FileInfo(filePath);
        if (!fileToSearch.Exists)
        {
            throw new FileNotFoundException("The file to search does not exist.", filePath);
        }

        // A non-positive buffer would never advance through the file.
        if (bufferSize <= 0)
        {
            throw new ArgumentOutOfRangeException("bufferSize", bufferSize, "Size of the file buffer must be greater than zero.");
        }

        // Pre-compute the pattern bytes and both Boyer-Moore shift tables once;
        // they are reused for every chunk.
        byte[] searchPattern = Encoding.UTF8.GetBytes(searchFor);
        long[] badCharacterShift = BuildBadCharacterShift(searchPattern);
        long[] goodSuffixShift = BuildGoodSuffixShift(searchPattern, FindSuffixes(searchPattern));

        if (bufferSize > (Int32.MaxValue - (searchPattern.Length - 1)))
        {
            throw new ArgumentOutOfRangeException("bufferSize", bufferSize, string.Format("Size of the file buffer ({0}) plus the size of the search pattern minus one ({1}) may not exceed Int32.MaxValue ({2}).", bufferSize, (searchPattern.Length - 1), Int32.MaxValue));
        }

        using (FileStream stream = fileToSearch.OpenRead())
        {
            if (!stream.CanSeek)
            {
                throw new NotSupportedException(String.Format("The file '{0}' is not seekable! Search cannot be performed.", fileToSearch));
            }

            int chunkIndex = 0;
            while (true)
            {
                byte[] fileData = GetNextChunkForSearch(stream, chunkIndex, bufferSize, searchPattern.Length);
                if (fileData == null || fileData.Length == 0)
                {
                    break;
                }

                // Computed in 64-bit arithmetic: the product exceeds Int32 for
                // files larger than 2 GiB.
                long bufferOffset = Convert.ToInt64(bufferSize) * Convert.ToInt64(chunkIndex);
                foreach (long bufferMatchIndex in GetMatchIndexes_Internal(fileData, searchPattern, goodSuffixShift, badCharacterShift))
                {
                    matchIndexes.Add(bufferMatchIndex + bufferOffset);
                }

                chunkIndex++;
            }
        }

        return new ReadOnlyCollection<long>(matchIndexes);
    }

    #endregion Public Methods

    #region Helpers

    /// <summary>
    /// Builds the bad-character table: for each possible byte value, how far the
    /// pattern may shift when that byte is aligned with the last pattern position.
    /// </summary>
    private static long[] BuildBadCharacterShift(byte[] pattern)
    {
        long[] badCharacterShift = new long[256];
        long patternLength = Convert.ToInt64(pattern.Length);

        // Bytes that do not occur in the pattern allow a shift of the full length.
        for (long c = 0; c < badCharacterShift.Length; ++c)
        {
            badCharacterShift[c] = patternLength;
        }

        // The last pattern byte is excluded on purpose; later occurrences of a
        // byte overwrite earlier ones so the rightmost occurrence wins.
        for (long i = 0; i < patternLength - 1; ++i)
        {
            badCharacterShift[pattern[i]] = patternLength - i - 1;
        }

        return badCharacterShift;
    }

    /// <summary>
    /// For every position i, computes the length of the longest substring ending
    /// at i that is also a suffix of the whole pattern.
    /// </summary>
    private static long[] FindSuffixes(byte[] pattern)
    {
        long f = 0;
        long patternLength = Convert.ToInt64(pattern.Length);
        long[] suffixes = new long[pattern.Length + 1];

        suffixes[patternLength - 1] = patternLength;
        long g = patternLength - 1;
        for (long i = patternLength - 2; i >= 0; --i)
        {
            if (i > g && suffixes[i + patternLength - 1 - f] < i - g)
            {
                // Position i lies inside a previously matched window; reuse that result.
                suffixes[i] = suffixes[i + patternLength - 1 - f];
            }
            else
            {
                if (i < g)
                {
                    g = i;
                }

                f = i;
                while (g >= 0 && (pattern[g] == pattern[g + patternLength - 1 - f]))
                {
                    --g;
                }

                suffixes[i] = f - g;
            }
        }

        return suffixes;
    }

    /// <summary>
    /// Builds the good-suffix table from the suffix lengths produced by
    /// <see cref="FindSuffixes"/>: entry i is the shift to apply after a mismatch
    /// at pattern position i (entry 0 is also the shift after a full match).
    /// </summary>
    private static long[] BuildGoodSuffixShift(byte[] pattern, long[] suff)
    {
        long patternLength = Convert.ToInt64(pattern.Length);
        long[] goodSuffixShift = new long[pattern.Length + 1];

        for (long i = 0; i < patternLength; ++i)
        {
            goodSuffixShift[i] = patternLength;
        }

        // Case 1: only a prefix of the pattern matches a suffix of the matched part.
        long j = 0;
        for (long i = patternLength - 1; i >= -1; --i)
        {
            if (i == -1 || suff[i] == i + 1)
            {
                for (; j < patternLength - 1 - i; ++j)
                {
                    if (goodSuffixShift[j] == patternLength)
                    {
                        goodSuffixShift[j] = patternLength - 1 - i;
                    }
                }
            }
        }

        // Case 2: the matched suffix re-occurs elsewhere inside the pattern.
        for (long i = 0; i <= patternLength - 2; ++i)
        {
            goodSuffixShift[patternLength - 1 - suff[i]] = patternLength - 1 - i;
        }

        return goodSuffixShift;
    }

    /// <summary>
    /// Reads the chunk with the given index. A chunk starts at
    /// chunkIndex * fileSearchBufferSize and spans up to
    /// fileSearchBufferSize + searchPatternLength - 1 bytes, so a match that begins
    /// in this chunk but ends in the next is still fully contained in the data.
    /// Returns null when the chunk starts at or past the end of the stream, or when
    /// fewer bytes than the pattern length remain.
    /// </summary>
    private static byte[] GetNextChunkForSearch(Stream stream, int chunkIndex, int fileSearchBufferSize, int searchPatternLength)
    {
        long fileStartIndex = Convert.ToInt64(chunkIndex) * Convert.ToInt64(fileSearchBufferSize);
        if (fileStartIndex >= stream.Length)
        {
            return null;
        }

        stream.Seek(fileStartIndex, SeekOrigin.Begin);

        int maxBytesToRead = fileSearchBufferSize + (searchPatternLength - 1);
        byte[] chunk = new byte[maxBytesToRead];

        // Stream.Read may return fewer bytes than requested before the end of the
        // stream, so keep reading until the buffer is full or the stream is exhausted.
        int numBytesRead = 0;
        while (numBytesRead < maxBytesToRead)
        {
            int bytesReadNow = stream.Read(chunk, numBytesRead, maxBytesToRead - numBytesRead);
            if (bytesReadNow == 0)
            {
                break;
            }

            numBytesRead += bytesReadNow;
        }

        if (numBytesRead < searchPatternLength)
        {
            // Too little data left to contain the pattern.
            return null;
        }

        if (numBytesRead < maxBytesToRead)
        {
            Array.Resize(ref chunk, numBytesRead);
        }

        return chunk;
    }

    /// <summary>
    /// Runs the Boyer-Moore scan over one in-memory buffer and returns the offsets
    /// (relative to the start of <paramref name="dataToSearch"/>) of every match.
    /// </summary>
    private static List<long> GetMatchIndexes_Internal(byte[] dataToSearch, byte[] searchPattern, long[] goodSuffixShift, long[] badCharacterShift)
    {
        List<long> matchIndexes = new List<long>();
        long patternLength = Convert.ToInt64(searchPattern.Length);
        long textLength = Convert.ToInt64(dataToSearch.Length);

        long index = 0;
        while (index <= (textLength - patternLength))
        {
            // Compare right-to-left; 'unmatched' is the pattern position being tested.
            long unmatched = patternLength - 1;
            while (unmatched >= 0)
            {
                if (searchPattern[unmatched] != dataToSearch[unmatched + index])
                {
                    // Take the larger of the two heuristic shifts; the good-suffix
                    // shift is always at least 1, so the scan always advances.
                    index += Math.Max(goodSuffixShift[unmatched], badCharacterShift[dataToSearch[unmatched + index]] - patternLength + 1 + unmatched);
                    break;
                }

                unmatched--;
            }

            if (unmatched < 0)
            {
                matchIndexes.Add(index);
                index += goodSuffixShift[0];
            }
        }

        return matchIndexes;
    }

    #endregion Helpers
}
}
No comments:
Post a Comment