I am using Lucene.Net + a custom crawler + IFilter so that I can index the data stored inside blobs.
// Crawl every container (except the "indexes" container, which holds the
// Lucene index itself) and index each blob's content.  Each blob is
// downloaded to local instance storage, handed to the IFilter-based
// indexer, and the temporary copy is then removed.
foreach (var item in containerList)
{
    CloudBlobContainer container = BlobClient.GetContainerReference(item.Name);

    // Skip the container that stores the Lucene index.
    if (container.Name != "indexes")
    {
        IEnumerable<IListBlobItem> blobs = container.ListBlobs();
        foreach (CloudBlob blob in blobs)
        {
            // Path.Combine avoids a missing/duplicated separator between
            // the local folder and the blob name.
            string localFile = System.IO.Path.Combine(path, blob.Name);
            try
            {
                blob.DownloadToFile(localFile);
                indexer.IndexBlobData(path, blob);
            }
            finally
            {
                // Always remove the temporary copy, even if indexing throws,
                // so instance storage does not fill up with orphaned files.
                if (System.IO.File.Exists(localFile))
                {
                    System.IO.File.Delete(localFile);
                }
            }
        }
    }
}
/* Code for crawling, which downloads each file locally to Azure instance storage */
The code below is the indexer function, which uses IFilter:
/// <summary>
/// Extracts the text of a previously downloaded blob with an IFilter and
/// adds it to the Lucene index along with the blob's URI.
/// </summary>
/// <param name="path">Local folder the blob was downloaded into.</param>
/// <param name="blob">The blob whose downloaded content should be indexed.</param>
/// <returns><c>true</c> when the document was added; <c>false</c> on any failure.</returns>
public bool IndexBlobData(string path, CloudBlob blob)
{
    try
    {
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

        // 'using' guarantees the reader (and the IFilter/file resources it
        // wraps) is released even when AddDocument throws; the original
        // code leaked the reader on that path.
        using (TextReader reader = new FilterReader(System.IO.Path.Combine(path, blob.Name)))
        {
            doc.Add(new Lucene.Net.Documents.Field("url", blob.Uri.ToString(),
                Lucene.Net.Documents.Field.Store.YES,
                Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
            // ReadToEnd() already returns a string; the extra ToString() was redundant.
            doc.Add(new Lucene.Net.Documents.Field("content", reader.ReadToEnd(),
                Lucene.Net.Documents.Field.Store.YES,
                Lucene.Net.Documents.Field.Index.ANALYZED));
            indexWriter.AddDocument(doc);
        }
        return true;
    }
    catch (Exception e)
    {
        // Best-effort by design: one bad blob must not stop the crawl.
        // Log instead of silently swallowing so failures can be diagnosed.
        System.Diagnostics.Trace.TraceError(
            "Failed to index blob '{0}': {1}", blob.Name, e);
        return false;
    }
}
Now my issue is that I don't want to download the file to instance storage. I want to pass the file directly to FilterReader, but it takes a "physical" path, and passing an HTTP address doesn't work. Can anybody suggest a workaround? I don't want to download the same file from blob storage again and then index it; instead I would prefer to download it into main memory and use the index filter on it directly.
I am using the IFilter implementation from here.