I have the following file and I am using an iterator block to parse certain recurring nodes/parts within the file. I initially used regex to parse the entire file, but when certain fields were not present in a node, it would not match. So I am trying to use the yield pattern. The file format is shown below, followed by the code I am using. All I want from the file are the REPLICATE nodes, each as an individual part, so I can fetch fields within it using a key string and store them in a collection of objects. I can start parsing where the first replicate occurs, but I am unable to end it where the replicate node ends.
File Format:
X_HEADER
{
DATA_MANAGEMENT_FIELD_2 NA
DATA_MANAGEMENT_FIELD_3 NA
DATA_MANAGEMENT_FIELD_4 NA
SYSTEM_SOFTWARE_VERSION NA
}
Y_HEADER
{
DATA_MANAGEMENT_FIELD_2 NA
DATA_MANAGEMENT_FIELD_3 NA
DATA_MANAGEMENT_FIELD_4 NA
SYSTEM_SOFTWARE_VERSION NA
}
COMPLETION
{
NUMBER 877
VERSION 4
CALIBRATION_VERSION 1
CONFIGURATION_ID 877
}
REPLICATE
{
REPLICATE_ID 1985
ASSAY_NUMBER 656
ASSAY_VERSION 4
ASSAY_STATUS Research
DILUTION_ID 1
}
REPLICATE
{
REPLICATE_ID 1985
ASSAY_NUMBER 656
ASSAY_VERSION 4
ASSAY_STATUS Research
}
Code:
/// <summary>
/// Lazily reads the file at <paramref name="path"/> and yields one dictionary of
/// field name/value pairs for every REPLICATE { ... } section it contains.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>One dictionary per REPLICATE section, yielded when its closing brace is read.</returns>
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        // Non-null only between the "{" of a REPLICATE section and its "}".
        Dictionary<string, string> current = null;
        // True when the previous non-blank line was exactly the REPLICATE header.
        bool headerSeen = false;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            if (line.Length == 0)
            {
                continue;
            }

            if (current != null)
            {
                if (line == "}")
                {
                    // The section is complete only now, so this is the place to yield.
                    yield return current;
                    current = null;
                }
                else
                {
                    // "NAME VALUE": split on the first run of spaces/tabs; the value may be absent.
                    var parts = line.Split(new[] { ' ', '\t' }, 2, StringSplitOptions.RemoveEmptyEntries);
                    current[parts[0]] = parts.Length > 1 ? parts[1].Trim() : string.Empty;
                }
            }
            else if (line == "REPLICATE")
            {
                // Exact match: a prefix test would also fire on REPLICATE_ID field lines.
                headerSeen = true;
            }
            else
            {
                if (line == "{" && headerSeen)
                {
                    current = new Dictionary<string, string>();
                }
                headerSeen = false;
            }
        }
    }
}
/// <summary>
/// Walks every part that <c>ReadParts</c> produces for the given file.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
public static void parseFile(string fileName)
{
    IEnumerable<IDictionary<string, string>> replicates = ReadParts(fileName);
    foreach (IDictionary<string, string> replicate in replicates)
    {
        // replicate["FIELD1"] retrieves the value of that field from the current REPLICATE part
    }
}
Well, it sounds like you just need to "close" a section when you get a closing brace, and only yield return at that point. For example:
/// <summary>
/// Lazily reads the sectioned file at <paramref name="path"/> and yields the name/value
/// map of every REPLICATE section at the moment its closing brace is read.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <returns>One dictionary per REPLICATE section, in file order.</returns>
/// <exception cref="BadDataException">
/// A brace, section name or name/value line appears where the format does not allow it.
/// </exception>
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        // Name of the section being read; null while outside any section.
        string currentName = null;
        // Values of the open section; null until its "{" has been read.
        IDictionary<string, string> currentMap = null;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            if (line == "{")
            {
                if (currentName == null || currentMap != null)
                {
                    throw new BadDataException("Open brace at wrong place");
                }
                currentMap = new Dictionary<string, string>();
            }
            else if (line == "}")
            {
                if (currentName == null || currentMap == null)
                {
                    throw new BadDataException("Closing brace at wrong place");
                }
                // Isolate the "REPLICATE-only" requirement to a single
                // line - if you ever need other bits, you can change this.
                if (currentName == "REPLICATE")
                {
                    yield return currentMap;
                }
                currentName = null;
                currentMap = null;
            }
            else if (!line.StartsWith("\t"))
            {
                // A line that is not tab-indented is treated as a section name.
                if (currentName != null || currentMap != null)
                {
                    throw new BadDataException("Section name at wrong place");
                }
                currentName = line;
            }
            else
            {
                if (currentName == null || currentMap == null)
                {
                    throw new BadDataException("Name/value pair at wrong place");
                }
                // Drop the leading tab, then expect exactly "name<TAB>value".
                var parts = line.Substring(1).Split('\t');
                if (parts.Length != 2)
                {
                    throw new BadDataException("Invalid name/value pair");
                }
                currentMap[parts[0]] = parts[1];
            }
        }
    }
}
Now that's a pretty ghastly function, to be honest. I suspect I'd put this in its own class instead (possibly a nested one) to store the state, and make each handler its own method. Heck, this is actually a situation where the state pattern could make sense :)
/// <summary>
/// Scans <paramref name="reader"/> line by line and yields the parsed block that
/// follows every line starting with "REPLICATE"; all other lines are skipped.
/// </summary>
/// <param name="reader">Source of the lines to scan.</param>
private IEnumerable<IDictionary<string, string>> ParseFile(System.IO.TextReader reader)
{
    for (string current = reader.ReadLine(); current != null; )
    {
        string header = current;
        // Always advance past the header so it is either skipped or followed by its block.
        current = reader.ReadLine();
        if (!header.StartsWith("REPLICATE"))
        {
            continue;
        }
        yield return ParseBlock(ref current, reader);
    }
}
/// <summary>
/// Parses one brace-delimited block. On entry <paramref name="token"/> must be "{";
/// on return it holds the line that follows the closing "}".
/// </summary>
/// <param name="token">Current line; updated as lines are consumed.</param>
/// <param name="reader">Source of the following lines.</param>
/// <returns>The name/value pairs found inside the block.</returns>
/// <exception cref="Exception">The opening or closing brace is missing.</exception>
private IDictionary<string, string> ParseBlock(ref string token, System.IO.TextReader reader)
{
    if (token != "{")
    {
        throw new Exception("Missing opening brace.");
    }

    token = reader.ReadLine();
    IDictionary<string, string> values = ParseValues(ref token, reader);

    if (token == "}")
    {
        // Step past the closing brace before handing the values back.
        token = reader.ReadLine();
        return values;
    }

    throw new Exception("Missing closing brace.");
}
/// <summary>
/// Reads tab-separated name/value lines, starting with <paramref name="token"/>, until a
/// "}" line or the end of input is reached.
/// </summary>
/// <param name="token">Current line; on return it is "}" or null (end of input).</param>
/// <param name="reader">Source of the following lines.</param>
/// <returns>The pairs read; the first two tab-separated fields of each line are used.</returns>
/// <exception cref="Exception">A line does not contain at least two tab-separated fields.</exception>
/// <exception cref="ArgumentException">The same name appears twice.</exception>
private IDictionary<string, string> ParseValues(ref string token, System.IO.TextReader reader)
{
    IDictionary<string, string> result = new Dictionary<string, string>();
    while (token != "}" && token != null)
    {
        var args = token.Split('\t');
        if (args.Length < 2)
        {
            throw new Exception("Expected a tab-separated name/value pair but found: " + token);
        }
        result.Add(args[0], args[1]);
        token = reader.ReadLine();
    }
    return result;
}
If you add a yield return current; after your while loop is over, you will get the final dictionary.
I believe it would be better to check for '}' as the end of the current block, and then put the yield return there. Although you can't use regex to parse the entire file, you can use regex to search for the key-value pairs within the lines. The following iterator code should work. It will only return dictionaries for REPLICATE blocks.
// Matches a line that is a key-value pair separated by whitespace: Key is a run of
// word characters, Value is everything after the whitespace up to the end of the line.
// Note that the value is optional.
static string partPattern = @"^(?<Key>\w*)(\s+(?<Value>.*))?$";
/// <summary>
/// Lazily reads the file at <paramref name="path"/> and yields one dictionary for each
/// line that, once trimmed, is exactly "REPLICATE" (the block body is read by
/// <c>parseReplicateBlock</c>).
/// </summary>
/// <param name="path">Path of the file to read.</param>
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (StreamReader input = File.OpenText(path))
    {
        for (string text = input.ReadLine(); text != null; text = input.ReadLine())
        {
            // Whitespace-only lines are ignored; only a REPLICATE header starts a block.
            bool startsReplicate = !string.IsNullOrWhiteSpace(text)
                && text.Trim().CompareTo("REPLICATE") == 0;
            if (startsReplicate)
            {
                yield return parseReplicateBlock(input);
            }
        }
    }
}
/// <summary>
/// Reads the body of one block: verifies the opening brace, then collects every line
/// matched by <c>partPattern</c> until a "}" line is found.
/// </summary>
/// <param name="reader">Reader positioned just after the block's header line.</param>
/// <returns>The key/value pairs found inside the block.</returns>
/// <exception cref="ApplicationException">The input ends before a closing brace.</exception>
private static IDictionary<string, string> parseReplicateBlock(StreamReader reader)
{
    VerifyOpening(reader);

    var fields = new Dictionary<string, string>();
    for (string raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            continue;
        }

        string text = raw.Trim();
        Match match = Regex.Match(text, partPattern);
        Group key = match.Groups["Key"];
        if (key.Length > 0)
        {
            // Named groups keep this independent of the group order in the pattern.
            fields.Add(key.Value, match.Groups["Value"].Value);
            continue;
        }

        if (text.CompareTo("}") == 0)
        {
            return fields;
        }
    }

    // Ran out of input without seeing the closing brace.
    throw new ApplicationException("Missing closing brace");
}
/// <summary>
/// Consumes lines up to and including the first non-blank one and requires it to be "{".
/// </summary>
/// <param name="reader">Reader to consume from.</param>
/// <exception cref="ApplicationException">
/// The first non-blank line is not "{", or the input ends first.
/// </exception>
private static void VerifyOpening(StreamReader reader)
{
    string candidate;
    do
    {
        candidate = reader.ReadLine();
    }
    while (candidate != null && string.IsNullOrWhiteSpace(candidate));

    if (candidate == null || candidate.Trim().CompareTo("{") != 0)
    {
        throw new ApplicationException("Missing opening brace");
    }
}
Update: I made sure that the regex string includes cases where there is no value. In addition, the group indexes were all changed to use the group name to avoid any issues if the regex string is modified.
Related
I am looking for tips to improve the readability of my code (which is written in C#). My main concern is usage of consecutive if statements and how should I replace them.
My code:
/// <summary>
/// Builds a map from media GUID to the products (with their categories and priority) that
/// reference that media, scanning the <c>BaseProduct</c> descendants of <paramref name="catalog"/>.
/// </summary>
/// <param name="stopSignaled">Checked before each product; when true the status callback is invoked and the scan stops.</param>
/// <param name="catalog">Catalog whose descendants are scanned.</param>
/// <param name="culture">Culture passed to the content loader when loading the products.</param>
/// <param name="onStatusChanged">Optional callback that receives the cancellation message.</param>
/// <returns>The associations collected (partial when the scan was cancelled).</returns>
private Dictionary<Guid, CommerceMediaFileAssociation<T>> GetMediaToContentsAssociations<T>(ref bool stopSignaled, CatalogContentBase catalog, CultureInfo culture, Action<string> onStatusChanged = null)
    where T : MediaData
{
    IEnumerable<ContentReference> descendentReferences = _contentLoader.GetDescendents(catalog.ContentLink);
    var associations = new Dictionary<Guid, CommerceMediaFileAssociation<T>>();
    if (descendentReferences.Any())
    {
        var descendentProducts = _contentLoader.GetItems<BaseProduct>(descendentReferences, culture);
        foreach (var product in descendentProducts)
        {
            if (stopSignaled)
            {
                onStatusChanged?.Invoke($"Reindexing canceled.");
                break;
            }
            // Media attached through the product's commerce media collection.
            if (product is IAssetContainer assetContainer && (assetContainer?.CommerceMediaCollection?.Any() ?? false))
            {
                foreach (CommerceMedia media in assetContainer.CommerceMediaCollection)
                {
                    PermanentLinkMap mediaLinkMap = _permanentLinkMapper.Find(media.AssetLink);
                    if ((mediaLinkMap?.Guid != null) && mediaLinkMap.Guid != Guid.Empty)
                    {
                        var productInformation = ProductUtilities.GetProductCategoriesAndPriority(product);
                        if (associations.TryGetValue(mediaLinkMap.Guid, out CommerceMediaFileAssociation<T> commerceMediaFileAssociations))
                        {
                            commerceMediaFileAssociations.Products.Add($"{product.ContentGuid.ToString()}||{product.MetaTitle}");
                            if (productInformation.Categories?.Any() ?? false)
                            {
                                foreach (string category in productInformation.Categories)
                                {
                                    commerceMediaFileAssociations.ProductCategories.Add(category);
                                }
                            }
                            associations[mediaLinkMap.Guid] = commerceMediaFileAssociations;
                        }
                        else
                        {
                            var commerceMediaFileAssociation = new CommerceMediaFileAssociation<T>();
                            commerceMediaFileAssociation.Products.Add($"{product.ContentGuid.ToString()}||{product.MetaTitle}");
                            // Same null/empty guard as the branch above: Categories may be null.
                            if (productInformation.Categories?.Any() ?? false)
                            {
                                foreach (string category in productInformation.Categories)
                                {
                                    commerceMediaFileAssociation.ProductCategories.Add(category);
                                }
                            }
                            associations.Add(mediaLinkMap.Guid, commerceMediaFileAssociation);
                        }
                        associations[mediaLinkMap.Guid].Priority = productInformation.Priority;
                    }
                }
            }
            // Documents attached to Rockstar products.
            if (product is RockstarProduct rockstar)
            {
                var files = rockstar.Rockstar_Product_Product_Documents.FilteredItems.Select(x => x.GetContent() as IContentMedia) ?? new List<IContentMedia>();
                foreach (var file in files)
                {
                    PermanentLinkMap mediaLinkMap = _permanentLinkMapper.Find(file.ContentLink);
                    if ((mediaLinkMap?.Guid != null) && mediaLinkMap.Guid != Guid.Empty)
                    {
                        var productInformation = ProductUtilities.GetProductCategoriesAndPriority(product);
                        if (associations.TryGetValue(mediaLinkMap.Guid, out CommerceMediaFileAssociation<T> commerceMediaFileAssociations))
                        {
                            commerceMediaFileAssociations.Products.Add($"{product.ContentGuid.ToString()}||{product.MetaTitle}");
                            if (productInformation.Categories?.Any() ?? false)
                            {
                                foreach (string category in productInformation.Categories)
                                {
                                    commerceMediaFileAssociations.ProductCategories.Add(category);
                                }
                            }
                            associations[mediaLinkMap.Guid] = commerceMediaFileAssociations;
                        }
                        else
                        {
                            var commerceMediaFileAssociation = new CommerceMediaFileAssociation<T>();
                            commerceMediaFileAssociation.Products.Add($"{product.ContentGuid.ToString()}||{product.MetaTitle}");
                            // Same null/empty guard as the branch above: Categories may be null.
                            if (productInformation.Categories?.Any() ?? false)
                            {
                                foreach (string category in productInformation.Categories)
                                {
                                    commerceMediaFileAssociation.ProductCategories.Add(category);
                                }
                            }
                            associations.Add(mediaLinkMap.Guid, commerceMediaFileAssociation);
                        }
                        associations[mediaLinkMap.Guid].Priority = productInformation.Priority;
                    }
                }
            }
        }
    }
    return associations;
}
How should I change it in order to make it clean, understandable and maintainable? Should I use guard clauses in this case?
You have a lot of code duplications. Some code parts are almost equivalent. It is easy to make them equal. This allows you extract them to another method.
Also, you have if-else statements where the else-part has a lot in common with the if-part where basically only the first statement differs (after having made the almost equal statements equal). You can extract these statements and execute them after the if-else. Then if-part becomes empty. You can invert the if-else and then drop the else part.
There is also no point in testing a collection with Any() before looping through it. If the collection is empty, the loop body will not be executed anyway.
The extracted method:
/// <summary>
/// Records <paramref name="product"/> (and its categories and priority) against the media
/// that <paramref name="link"/> resolves to, creating the association entry when needed.
/// Does nothing when the link does not resolve to a non-empty GUID.
/// </summary>
/// <param name="associations">Map from media GUID to association; updated in place.</param>
/// <param name="product">Product that references the media.</param>
/// <param name="link">Link handed to the permanent link mapper.</param>
private void AddCategoriesAndProducts<T>(Dictionary<Guid, CommerceMediaFileAssociation<T>> associations, BaseProduct product, string link) where T : MediaData
{
    PermanentLinkMap linkMap = _permanentLinkMapper.Find(link);
    if (linkMap?.Guid == null || linkMap.Guid == Guid.Empty) {
        return;
    }

    var info = ProductUtilities.GetProductCategoriesAndPriority(product);
    bool alreadyKnown = associations.TryGetValue(linkMap.Guid, out CommerceMediaFileAssociation<T> association);
    if (!alreadyKnown) {
        association = new CommerceMediaFileAssociation<T>();
    }

    association.Products.Add($"{product.ContentGuid}||{product.MetaTitle}");
    if (info.Categories != null) {
        foreach (string categoryName in info.Categories) {
            association.ProductCategories.Add(categoryName);
        }
    }

    // Store first, then set the priority through the dictionary entry.
    associations[linkMap.Guid] = association;
    associations[linkMap.Guid].Priority = info.Priority;
}
The simplified method:
/// <summary>
/// Builds the media-GUID-to-association map for the <c>BaseProduct</c> descendants of
/// <paramref name="catalog"/>, delegating each media link to <c>AddCategoriesAndProducts</c>.
/// </summary>
/// <param name="stopSignaled">Checked before each product; when true the callback is invoked and the scan stops.</param>
/// <param name="catalog">Catalog whose descendants are scanned.</param>
/// <param name="culture">Culture passed to the content loader.</param>
/// <param name="onStatusChanged">Optional callback that receives the cancellation message.</param>
/// <returns>The associations collected (partial when the scan was cancelled).</returns>
private Dictionary<Guid, CommerceMediaFileAssociation<T>> GetMediaToContentsAssociations<T>(ref bool stopSignaled, CatalogContentBase catalog, CultureInfo culture, Action<string> onStatusChanged = null)
    where T : MediaData
{
    IEnumerable<ContentReference> references = _contentLoader.GetDescendents(catalog.ContentLink);
    var associations = new Dictionary<Guid, CommerceMediaFileAssociation<T>>();
    if (!references.Any()) {
        return associations;
    }

    foreach (var product in _contentLoader.GetItems<BaseProduct>(references, culture)) {
        if (stopSignaled) {
            onStatusChanged?.Invoke($"Reindexing canceled.");
            break;
        }

        // Media attached through the commerce media collection.
        if (product is IAssetContainer container && container.CommerceMediaCollection != null) {
            foreach (CommerceMedia item in container.CommerceMediaCollection) {
                AddCategoriesAndProducts(associations, product, item.AssetLink);
            }
        }

        // Documents attached to Rockstar products.
        if (product is RockstarProduct rockstarProduct) {
            var documents = rockstarProduct.Rockstar_Product_Product_Documents.FilteredItems.Select(x => x.GetContent() as IContentMedia) ?? new List<IContentMedia>();
            foreach (var document in documents) {
                AddCategoriesAndProducts(associations, product, document.ContentLink);
            }
        }
    }
    return associations;
}
These methods are still relatively complex and could be split into even smaller methods, such as AddAssetProducts and AddRockstarProducts.
I'm trying to split very large JSON files into smaller files for a given array. For example:
{
"headerName1": "headerVal1",
"headerName2": "headerVal2",
"headerName3": [{
"element1Name1": "element1Value1"
},
{
"element2Name1": "element2Value1"
},
{
"element3Name1": "element3Value1"
},
{
"element4Name1": "element4Value1"
},
{
"element5Name1": "element5Value1"
},
{
"element6Name1": "element6Value1"
}]
}
...down to { "elementNName1": "elementNValue1" } where N is a large number
The user provides the name which represents the array to be split (in this example "headerName3") and the number of array objects per file, e.g. 1,000,000
This would result in N files, each containing the top name:value pairs (headerName1, headerName2) and up to 1,000,000 of the headerName3 objects.
I'm using the excellent Newtonsoft Json.NET and understand that I need to do this using a stream.
So far I have looked at reading in JToken objects to establish where PropertyName == "headerName3" occurs when reading in the tokens, but what I would like to do is then read in the entire JSON object for each object in the array and not have to continue parsing JSON into JTokens.
Here's a snippet of the code I am building so far:
// Streams the JSON tokens of the input file and tracks object nesting so that each
// object closing back at nesting level 1 is counted as one array record.
using (StreamReader oSR = File.OpenText(strInput))
using (var reader = new JsonTextReader(oSR))
{
    while (reader.Read())
    {
        switch (reader.TokenType)
        {
            case JsonToken.StartObject:
                intObjectCount++;
                break;

            case JsonToken.EndObject:
                intObjectCount--;
                if (intObjectCount != 1)
                {
                    break;
                }
                intArrayRecordCount++;
                // Here I want to read the entire object for this record into an untyped JSON object
                if (intArrayRecordCount % 1000000 == 0)
                {
                    //write these to the split file
                }
                break;
        }
    }
}
I don't know - and, in fact, am not concerned with - the structure of the JSON itself, and the objects can be of varying structures within the array. I am therefore not serializing to classes.
Is this the right approach? Is there a set of methods in the JSON.net library I can easily use to perform such operation?
Any help appreciated.
You can use JsonWriter.WriteToken(JsonReader reader, true) to stream individual array entries and their descendants from a JsonReader to a JsonWriter. You can also use JProperty.Load(JsonReader reader) and JProperty.WriteTo(JsonWriter writer) to read and write entire properties and their descendants.
Using these methods, you can create a state machine that parses the JSON file, iterates through the root object, loads "prefix" and "postfix" properties, splits the array property, and writes the prefix, array slice, and postfix properties out to new file(s).
Here's a prototype implementation that takes a TextReader and a callback function to create sequential output TextWriter objects for the split file:
// States of the SplitJson parser as it walks the properties of the root JSON object.
enum SplitState
{
// Reading root properties that appear before the property being split.
InPrefix,
// The name of the property to split has been read; also re-entered when its array ends.
InSplitProperty,
// Inside the array value of the property being split.
InSplitArray,
// Reading root properties that appear after the property being split.
InPostfix,
}
/// <summary>
/// Splits the array-valued root property <paramref name="tokenName"/> of a JSON object into
/// several output documents of at most <paramref name="maxItems"/> array entries each. Root
/// properties read before the array are written at the start of every output, and those
/// read after it are appended to every output once the input has been fully read.
/// </summary>
/// <param name="textReader">Source JSON; the root container must be an object.</param>
/// <param name="tokenName">Name of the root property whose array is split.</param>
/// <param name="maxItems">Maximum number of array entries per output; must be positive.</param>
/// <param name="createStream">Called with the zero-based output index to obtain each output writer.</param>
/// <param name="formatting">Formatting applied to every output.</param>
/// <exception cref="ArgumentNullException"><paramref name="textReader"/> or <paramref name="createStream"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxItems"/> is not positive.</exception>
/// <exception cref="JsonException">The root is not an object, or an unexpected token is met.</exception>
public static void SplitJson(TextReader textReader, string tokenName, long maxItems, Func<int, TextWriter> createStream, Formatting formatting)
{
    if (textReader == null)
        throw new ArgumentNullException("textReader");
    if (createStream == null)
        throw new ArgumentNullException("createStream");
    // count % maxItems below would otherwise fail with DivideByZeroException.
    if (maxItems <= 0)
        throw new ArgumentOutOfRangeException("maxItems", "maxItems must be greater than zero.");

    List<JProperty> prefixProperties = new List<JProperty>();
    List<JProperty> postFixProperties = new List<JProperty>();
    List<JsonWriter> writers = new List<JsonWriter>();
    SplitState state = SplitState.InPrefix;
    long count = 0;
    try
    {
        using (var reader = new JsonTextReader(textReader))
        {
            bool doRead = true;
            while (doRead ? reader.Read() : true)
            {
                doRead = true;
                if (reader.TokenType == JsonToken.Comment || reader.TokenType == JsonToken.None)
                    continue;
                if (reader.Depth == 0)
                {
                    if (reader.TokenType != JsonToken.StartObject && reader.TokenType != JsonToken.EndObject)
                        throw new JsonException("JSON root container is not an Object");
                }
                else if (reader.Depth == 1 && reader.TokenType == JsonToken.PropertyName)
                {
                    if ((string)reader.Value == tokenName)
                    {
                        state = SplitState.InSplitProperty;
                    }
                    else
                    {
                        // Any other property seen after the split property is a postfix property.
                        if (state == SplitState.InSplitProperty)
                            state = SplitState.InPostfix;
                        var property = JProperty.Load(reader);
                        doRead = false; // JProperty.Load() will have already advanced the reader.
                        if (state == SplitState.InPrefix)
                        {
                            prefixProperties.Add(property);
                        }
                        else
                        {
                            postFixProperties.Add(property);
                        }
                    }
                }
                else if (reader.Depth == 1 && reader.TokenType == JsonToken.StartArray && state == SplitState.InSplitProperty)
                {
                    state = SplitState.InSplitArray;
                }
                else if (reader.Depth == 1 && reader.TokenType == JsonToken.EndArray && state == SplitState.InSplitArray)
                {
                    state = SplitState.InSplitProperty;
                }
                else if (state == SplitState.InSplitArray && reader.Depth == 2)
                {
                    // Start a new output every maxItems entries, seeded with the prefix properties.
                    if (count % maxItems == 0)
                    {
                        var writer = new JsonTextWriter(createStream(writers.Count)) { Formatting = formatting };
                        writers.Add(writer);
                        writer.WriteStartObject();
                        foreach (var property in prefixProperties)
                            property.WriteTo(writer);
                        writer.WritePropertyName(tokenName);
                        writer.WriteStartArray();
                    }
                    count++;
                    // Copies the current array entry and all of its descendants.
                    writers.Last().WriteToken(reader, true);
                }
                else
                {
                    throw new JsonException("Internal error");
                }
            }
        }
        // Postfix properties are only known now, so every output is completed here.
        foreach (var writer in writers)
            using (writer)
            {
                writer.WriteEndArray();
                foreach (var property in postFixProperties)
                    property.WriteTo(writer);
                writer.WriteEndObject();
            }
    }
    finally
    {
        // Make sure files are closed in the event of an exception.
        foreach (var writer in writers)
            using (writer)
            {
            }
    }
}
This method leaves all the files open until the end in case "postfix" properties, appearing after the array property, need to be appended. Be aware that there is a limit of 16384 open files at one time, so if you need to create more split files, this won't work. If postfix properties are never encountered in practice, you can just close each file before opening the next and throw an exception in case any postfix properties are found. Otherwise you may need to parse the large file in two passes or close and reopen the split files to append them.
Here is an example of how to use the method with an in-memory JSON string:
/// <summary>
/// Splits <paramref name="json"/> on <paramref name="tokenName"/> into in-memory outputs of
/// two array items each and prints every output to the console.
/// </summary>
private static void TestSplitJson(string json, string tokenName)
{
    var outputs = new List<StringBuilder>();

    // Each requested output is backed by its own StringBuilder, kept for printing afterwards.
    Func<int, TextWriter> openNext = index =>
    {
        var buffer = new StringBuilder();
        outputs.Add(buffer);
        return new StringWriter(buffer);
    };

    using (var source = new StringReader(json))
    {
        SplitJson(source, tokenName, 2, openNext, Formatting.Indented);
    }

    foreach (StringBuilder output in outputs)
    {
        Console.WriteLine(output.ToString());
    }
}
Prototype fiddle.
I usually add some strings from a text file into a list or array line by line, although I am now using "#"s as separators in the text file. How would it be possible to read the two strings "softpedia.com" and "download.com" into a list, using the two "#" signs as breaking points? Bear in mind that there might be more or fewer strings in between the two hashes.
e.g.
# Internal Hostnames
softpedia.com
download.com
# External Hostnames
Expected output:
softpedia.com
download.com
/// <summary>
/// Prints the lines of test.txt that lie between the first and the second line starting with '#'.
/// </summary>
class Program
{
    static void Main()
    {
        using (StreamReader input = File.OpenText("test.txt"))
        {
            foreach (string hostname in Parse(input))
            {
                Console.WriteLine(hostname);
            }
        }
    }

    /// <summary>
    /// Lazily yields the lines found after the first line starting with "#" and stops at
    /// the next such line (or at the end of the input).
    /// </summary>
    /// <param name="reader">Reader to consume lines from.</param>
    public static IEnumerable<string> Parse(StreamReader reader)
    {
        bool insideSection = false;
        for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
        {
            if (current.StartsWith("#"))
            {
                // The second marker line ends the section; the first one opens it.
                if (insideSection)
                {
                    yield break;
                }
                insideSection = true;
                continue;
            }

            if (insideSection)
            {
                yield return current;
            }
        }
    }
}
and if you wanted to just get them in a list:
// Materialize the parsed lines into a list while the reader is still open.
using (StreamReader reader = File.OpenText("test.txt"))
{
    var hostnames = new List<string>(Parse(reader));
}
Read it into a buffer and let regex do the work.
// Sample input: the host names of interest sit between the two '#' lines.
string input = @"
# Internal Hostnames
softpedia.com
download.com
# External Hostnames
";
// ^(?!#) skips lines that start with '#'; Text captures the leading run of
// non-whitespace characters of every other line (Multiline makes ^ match per line).
string pattern = @"^(?!#)(?<Text>[^\r\s]+)(?:\s?)";
Regex.Matches(input, pattern, RegexOptions.Multiline)
     .OfType<Match>()
     .Select(mt => mt.Groups["Text"].Value)
     .ToList()
     .ForEach(site => Console.WriteLine(site));
/* Outputs
softpedia.com
download.com
*/
It sounds like you want to read all of the lines in between a set of # start lines. If so try the following
/// <summary>
/// Returns the lines found between the first and the second line starting with '#'.
/// If there is no second marker line, everything after the first one is returned; if
/// there is no marker line at all, the result is empty.
/// </summary>
/// <param name="filePath">Path of the text file to read.</param>
List<string> ReadLines(string filePath) {
    var list = new List<string>();
    var foundStart = false;
    foreach (var line in File.ReadAllLines(filePath)) {
        if (line.Length > 0 && line[0] == '#') {
            // The second marker closes the section.
            if (foundStart) {
                return list;
            }
            foundStart = true;
        } else if (foundStart) {
            list.Add(line);
        }
    }
    // No closing marker: return what was collected.
    return list;
}
Is there a way to query an XmlSchema or XmlSchemaSet for a list of available tags/attributes at a certain point in the XML? So say my cursor is between <b> and </b> and my schema only allows for a <c/> element there, can I figure that out using anything built in to C#?
<tagset>
<a></a>
<b><!-- CURSOR IS HERE --></b>
</tagset>
There is a way, but the Xml Schema specification is complex so it will take some effort and a few hundred lines of code.
The GetExpectedParticles method of the .NET XmlSchemaValidator class is the key part to a solution. This uses the XmlSchemaSet, passed as an argument, to return a set of XmlSchemaObject instances.
Before you can call this method you need to build a node path to your cursor location which must include ancestor elements and their preceding siblings and also the preceding siblings at the current nesting level. This node path is used to set the context for the schema validator.
After GetExpectedParticles has been called you need to process the particles. For instance, check whether each expected particle is a member of a substitution group, and check whether the expected particle is a restricted simple type that's an enumeration.
It's probably best to separate out code that fetches expected elements and attributes respectively.
The following incomplete code snippet includes the GetExpectedParticles method call, this only caters for element tag content, not attributes:
/// <summary>
/// Replays <paramref name="nodePath"/> through an <c>XmlSchemaValidator</c> and returns the
/// schema objects expected at that position (element content only, not attributes).
/// </summary>
/// <param name="schemaSet">Schema set used for validation.</param>
/// <param name="nodePath">Elements leading to the position being queried, in document order.</param>
/// <returns>
/// The expected elements, with substitution-group members and <c>xs:any</c> namespaces
/// expanded; when no particle is expected and the current element only holds a value,
/// the current schema element itself.
/// </returns>
public static List<XmlSchemaObject> XsdExpectedElements(XmlSchemaSet schemaSet,
    List<NodeDescriptor> nodePath)
{
    List<XmlSchemaObject> elementNames = new List<XmlSchemaObject>();
    NameTable nt = new NameTable();
    XmlNamespaceManager manager = new XmlNamespaceManager(nt);
    XmlSchemaValidator validator = new XmlSchemaValidator(nt, schemaSet, manager, XmlSchemaValidationFlags.None);
    // event handler sets validationErrorFound local field
    validator.ValidationEventHandler += new ValidationEventHandler(validator_ValidationEventHandler);
    validator.Initialize();
    XmlSchemaInfo xsInfo = new XmlSchemaInfo();
    int i = 0;
    foreach (NodeDescriptor nameUri in nodePath)
    {
        validator.ValidateElement(nameUri.LocalName, nameUri.NamespaceUri, xsInfo);
        // Closed nodes (and those at or past siblingPosition) are skipped over entirely.
        if ((i >= siblingPosition && siblingPosition > -1) || nameUri.Closed)
        {
            validator.SkipToEndElement(null);
        }
        else
        {
            validator.ValidateEndOfAttributes(null);
        }
        i++;
    }

    XmlSchemaParticle[] parts = validator.GetExpectedParticles();
    if (parts.Length == 0)
    {
        // No child element is expected: decide whether the current element holds a value only.
        bool hasElements = true;
        bool elementClosed = nodePath.Count > 0 && nodePath[nodePath.Count - 1].Closed;
        if (!elementClosed) // when closed we're outside the element tags
        {
            if (xsInfo.SchemaType is XmlSchemaSimpleType)
            {
                hasElements = false;
            }
            else
            {
                XmlSchemaComplexType xsCt = xsInfo.SchemaType as XmlSchemaComplexType;
                if (xsCt != null && xsCt.ContentType == XmlSchemaContentType.TextOnly)
                {
                    hasElements = false;
                }
            }
        }
        if (!hasElements)
        {
            expectedType = XmlEditor.expectedListType.elementValue;
            if (xsInfo.SchemaElement != null)
            {
                elementNames.Add(xsInfo.SchemaElement);
            }
        }
        return elementNames;
    }

    foreach (XmlSchemaObject xso in parts)
    {
        if (xso is XmlSchemaElement)
        {
            XmlSchemaElement xse = (XmlSchemaElement)xso;
            if (subGroupList.ContainsKey(xse.QualifiedName))
            {
                // Head of a substitution group: offer the members listed for it.
                List<XmlSchemaElement> xses = subGroupList[xse.QualifiedName];
                foreach (XmlSchemaElement xseInstance in xses)
                {
                    elementNames.Add(xseInstance);
                }
            }
            else
            {
                elementNames.Add(xse);
            }
        }
        else if (xso is XmlSchemaAny)
        {
            // Wildcard: offer the global elements of every schema whose target namespace matches.
            XmlSchemaAny xsa = (XmlSchemaAny)xso;
            foreach (XmlSchema xs in schemaSet.Schemas())
            {
                if (xs.TargetNamespace == xsa.Namespace)
                {
                    foreach (XmlSchemaElement xseAny in xs.Elements)
                    {
                        elementNames.Add(xseAny);
                    }
                }
            }
        }
    }
    return elementNames;
}
The following (incomplete) code snippet shows how to get expected enumerated values from a particle:
/// <summary>
/// Returns the enumerated values allowed for an attribute or element declaration.
/// </summary>
/// <param name="xsso">An <c>XmlSchemaAttribute</c> or <c>XmlSchemaElement</c>; anything else (including null) yields an empty list.</param>
/// <returns>
/// "true"/"false" for boolean simple types, the enumeration values found by
/// <c>ProcessXmlSimpleType</c> for other simple types, the enumeration facets of a
/// simple-content restriction for complex types, otherwise an empty list.
/// </returns>
private List<string> ExpectedEnumValues(XmlSchemaObject xsso)
{
    List<string> values = new List<string>();
    XmlSchemaSimpleType xst = null;
    XmlSchemaComplexType xsCt = null;

    XmlSchemaAttribute xsa = xsso as XmlSchemaAttribute;
    XmlSchemaElement xse = xsso as XmlSchemaElement;
    if (xsa != null)
    {
        xst = xsa.AttributeSchemaType;
    }
    else if (xse != null)
    {
        // At most one of these is non-null; both stay null when the element has no resolved type.
        xst = xse.ElementSchemaType as XmlSchemaSimpleType;
        xsCt = xse.ElementSchemaType as XmlSchemaComplexType;
    }
    else
    {
        // Null or a particle that is neither an attribute nor an element.
        return values;
    }

    if (xst != null)
    {
        if (xst.TypeCode == XmlTypeCode.Boolean)
        {
            values.Add("true");
            values.Add("false");
        }
        else
        {
            ProcessXmlSimpleType(xst, values);
        }
    }
    else if (xsCt != null)
    {
        // ContentModel is null for complex types without simpleContent/complexContent,
        // and complex content carries no enumeration facets, so only simple content matters.
        XmlSchemaSimpleContent xsSC = xsCt.ContentModel as XmlSchemaSimpleContent;
        XmlSchemaSimpleContentRestriction xsCCR =
            xsSC != null ? xsSC.Content as XmlSchemaSimpleContentRestriction : null;
        if (xsCCR != null)
        {
            foreach (XmlSchemaObject xso in xsCCR.Facets)
            {
                XmlSchemaEnumerationFacet xsef = xso as XmlSchemaEnumerationFacet;
                if (xsef != null)
                {
                    values.Add(xsef.Value);
                }
            }
        }
    }
    return values;
}
And to process a simple type:
/// <summary>
/// Appends to <paramref name="values"/> the enumeration facet values declared by a simple
/// type: directly for a restriction, via the item type for a list, and via every member
/// type for a union.
/// </summary>
/// <param name="xst">Simple type to inspect; null is ignored.</param>
/// <param name="values">List that receives the enumeration values.</param>
private static void ProcessXmlSimpleType(XmlSchemaSimpleType xst, List<string> values)
{
    if (xst == null)
    {
        return;
    }

    XmlSchemaSimpleTypeContent content = xst.Content;

    XmlSchemaSimpleTypeRestriction restriction = content as XmlSchemaSimpleTypeRestriction;
    if (restriction != null)
    {
        foreach (XmlSchemaObject facet in restriction.Facets)
        {
            XmlSchemaEnumerationFacet enumeration = facet as XmlSchemaEnumerationFacet;
            if (enumeration != null)
            {
                values.Add(enumeration.Value);
            }
        }
        return;
    }

    XmlSchemaSimpleTypeList list = content as XmlSchemaSimpleTypeList;
    if (list != null)
    {
        ProcessXmlSimpleType(list.BaseItemType, values); // recursive
        return;
    }

    XmlSchemaSimpleTypeUnion union = content as XmlSchemaSimpleTypeUnion;
    if (union != null)
    {
        foreach (XmlSchemaSimpleType member in union.BaseMemberTypes)
        {
            ProcessXmlSimpleType(member, values); // recursive
        }
    }
}
The above code snippets probably address 20% of what's needed, but hopefully give you some idea of what you will be dealing with. .NET provides a very powerful set of classes for analysing the Schema Object Model, but you will need detailed knowledge of the XML Schema specification to get usable results.
XML editors should still provide auto-completion help when the XML is not valid, this adds an extra dimension to the problem because there may be ambiguities if there's limited validation context and the schema design is more 'russian-doll' than 'salami sliced'.
Summary
Getting a list of expected XML schema particles for a given context within an XML instance using .NET is possible but relatively complex. In view of this, it would be worthwhile to first check if libraries from existing .NET XML editors provide the functionality you need.
For a working implementation under LGPL have a look at SharpDevelops XmlEditor part.
You get the code completion for xml in one dll, namely the XmlEditor.dll in the AddIns/DisplayBindings directory.
Is there a way to read ahead one line to test if the next line contains specific tag data?
I'm dealing with a format that has a start tag but no end tag.
I would like to read a line, add it to a structure, and then test the line below to make sure it is not a new "node". If it isn't, keep adding; if it is, close off that struct and make a new one.
The only solution I can think of is to have two stream readers going at the same time, shuffling their way along in lock step, but that seems wasteful (if it will even work).
I need something like Peek, but PeekLine.
The problem is the underlying stream may not even be seekable. If you take a look at the stream reader implementation it uses a buffer so it can implement TextReader.Peek() even if the stream is not seekable.
You could write a simple adapter that reads the next line and buffers it internally, something like this:
/// <summary>
/// Wraps a <see cref="StreamReader"/> so that the next line can be inspected without
/// consuming it.
/// </summary>
public class PeekableStreamReaderAdapter
{
    private StreamReader Underlying;
    // Lines already pulled from the reader by PeekLine but not yet returned by ReadLine.
    private Queue<string> BufferedLines;

    /// <param name="underlying">Reader the lines are taken from.</param>
    public PeekableStreamReaderAdapter(StreamReader underlying)
    {
        Underlying = underlying;
        BufferedLines = new Queue<string>();
    }

    /// <summary>
    /// Returns the line the next <see cref="ReadLine"/> call will return, or null at the
    /// end of the input. Repeated calls return the same line.
    /// </summary>
    public string PeekLine()
    {
        // A previously peeked line is still pending: peeking again must not advance the reader.
        if (BufferedLines.Count > 0)
            return BufferedLines.Peek();
        string line = Underlying.ReadLine();
        if (line == null)
            return null;
        BufferedLines.Enqueue(line);
        return line;
    }

    /// <summary>
    /// Returns the next line (the peeked one, if any), or null at the end of the input.
    /// </summary>
    public string ReadLine()
    {
        if (BufferedLines.Count > 0)
            return BufferedLines.Dequeue();
        return Underlying.ReadLine();
    }
}
You could store the position by reading StreamReader.BaseStream.Position, then read the next line, do your test, and then seek back to the position you stored before reading the line:
// Peek at the next line
// NOTE(review): StreamReader reads ahead into its own buffer, so BaseStream.Position can
// already be past the line about to be read; confirm the saved position really is the
// start of that line before relying on this approach.
long peekPos = reader.BaseStream.Position;
string line = reader.ReadLine();
if (line == null)
{
    // End of input: there is no next line to inspect.
}
else if (line.StartsWith("<tag start>"))
{
    // This is a new tag, so we reset the position
    reader.BaseStream.Seek(peekPos, SeekOrigin.Begin);
    // The reader's buffer no longer matches the stream position after a seek.
    reader.DiscardBufferedData();
}
else
{
    // This is part of the same node.
}
This is a lot of seeking and re-reading the same lines. Using some logic, you may be able to avoid this altogether - for instance, when you see a new tag start, close out the existing structure and start a new one - here's a basic algorithm:
// Builds structures line by line: a line starting with "<tag start>" closes the current
// structure (if any) and opens a new one; every other line belongs to the current one.
SomeStructure myStructure = null;
while (!reader.EndOfStream)
{
    string currentLine = reader.ReadLine();
    if (currentLine.StartsWith("<tag start>"))
    {
        // Close out existing structure.
        if (myStructure != null)
        {
            // Close out the existing structure.
        }
        // Create a new structure and add this line.
        myStructure = new SomeStructure();
        // Append to myStructure.
    }
    else
    {
        // Add to the existing structure.
        if (myStructure != null)
        {
            // Append to existing myStructure
        }
        else
        {
            // This means the first line was not part of a structure.
            // Either handle this case, or throw an exception.
        }
    }
}
Why the difficulty? Return the next line, regardless. Check if it is a new node, if not, add it to the struct. If it is, create a new struct.
// Not exactly C# but close enough
// Groups lines into structures: each node line starts a new structure and the lines
// that follow are added to it.
Collection structs = new Collection();
Struct current = null;
while ((line = readline()) != null) {
    if (IsNode(line)) {
        if (current != null) structs.add(current);
        current = new Struct();
        continue;
    }
    // Whatever processing you need to do
    // Lines before the first node have no structure to go into, so they are skipped.
    if (current != null) current.addLine(line);
}
if (current != null) structs.add(current); // Add the last one to the collection
// Use your structures here
foreach (var s in structs) {
}
Here is what I have got so far. I went with splitting the whole text rather than reading line by line with a StreamReader.
I'm sure there are a few places that are dying to be more elegant, but for right now it seems to be working.
Please let me know what you think
// One individual record, as filled in by ParseGedcom.
struct INDI
{
// Record identifier taken from the record's first line, with the INDI tag stripped.
public string ID;
// Text of the NAME line with the "/" characters removed.
public string Name;
// Text of the SEX line.
public string Sex;
// Text of the line following the BIRT line with "2 DATE " removed; left unset when there is no BIRT line.
public string BirthDay;
// True only when the DEAT line carries the value "Y".
public bool Dead;
}
// One family-to-individual link, as filled in by ParseGedcom (one entry per HUSB/WIFE/CHIL line).
struct FAM
{
// Identifier taken from the first line of the family record.
public string FamID;
// "PAR" for HUSB and WIFE lines, "CHIL" for CHIL lines.
public string type;
// Identifier of the individual the line refers to.
public string IndiID;
}
// Results collected by ParseGedcom: one INDI per individual record, one FAM per family member line.
List<INDI> Individuals = new List<INDI>();
List<FAM> Family = new List<FAM>();
/// <summary>
/// Click handler that parses the GEDCOM file at a fixed path.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // Verbatim string so the backslash in the path is not treated as an escape.
    string path = @"C:\mostrecent.ged";
    ParseGedcom(path);
}
/// <summary>
/// Reads the GEDCOM file at <paramref name="path"/> and fills <c>Individuals</c> and
/// <c>Family</c> from its INDI and FAM records; all other records are ignored.
/// </summary>
/// <param name="path">Path of the GEDCOM file.</param>
private void ParseGedcom(string path)
{
    // Read the whole file and release the handle straight away.
    string contents;
    using (StreamReader SR = new StreamReader(path))
    {
        contents = SR.ReadToEnd();
    }

    // Level-0 records that carry an identifier start with "0 @"; splitting on that marker
    // gives one chunk per individual/family (no other info is needed for this instance).
    string[] Holder = contents.Split(new[] { "0 @" }, StringSplitOptions.None);

    foreach (string Node in Holder)
    {
        // Sub-split the chunk into its lines, whatever the line-ending convention.
        string[] SubNode = Node.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

        if (SubNode[0].Contains("INDI"))
        {
            INDI I = new INDI();
            // ID number without the marker and the tag.
            I.ID = SubNode[0].Replace("@", "").Replace(" INDI", "").Trim();

            // Each lookup returns -1 when the tag is absent; the field is then left unset.
            int nameIndex = FindIndexinArray(SubNode, "NAME");
            if (nameIndex != -1)
            {
                // The slashes around the last name are formatting only.
                I.Name = SubNode[nameIndex].Replace("1 NAME", "").Replace("/", "").Trim();
            }

            int sexIndex = FindIndexinArray(SubNode, "SEX");
            if (sexIndex != -1)
            {
                I.Sex = SubNode[sexIndex].Replace("1 SEX ", "").Trim();
            }

            // The date is read from the line that follows the BIRT line.
            int birthIndex = FindIndexinArray(SubNode, "1 BIRT ");
            if (birthIndex != -1 && birthIndex + 1 < SubNode.Length)
            {
                I.BirthDay = SubNode[birthIndex + 1].Replace("2 DATE ", "").Trim();
            }

            // Dead defaults to false, so it only changes when the DEAT value is Y.
            int deathIndex = FindIndexinArray(SubNode, "1 DEAT ");
            if (deathIndex != -1 && SubNode[deathIndex].Replace("1 DEAT ", "").Trim() == "Y")
            {
                I.Dead = true;
            }

            Individuals.Add(I);
        }
        else if (SubNode[0].Contains("FAM"))
        {
            // Grab the family id once instead of once per member line.
            string FamID = SubNode[0].Replace("@ FAM", "").Trim();

            // A family can list several children, so every line is examined.
            foreach (string Line in SubNode)
            {
                string tag = Line.Contains("1 HUSB ") ? "1 HUSB "
                           : Line.Contains("1 WIFE ") ? "1 WIFE "
                           : Line.Contains("1 CHIL ") ? "1 CHIL "
                           : null;
                if (tag == null)
                {
                    continue;
                }

                FAM F = new FAM();
                F.FamID = FamID;
                F.type = tag == "1 CHIL " ? "CHIL" : "PAR";
                F.IndiID = Line.Replace(tag, "").Replace("@", "").Trim();
                Family.Add(F);
            }
        }
    }
}
/// <summary>
/// Returns the index of the last element of <paramref name="Arr"/> that contains
/// <paramref name="search"/>, or -1 when no element does.
/// </summary>
/// <param name="Arr">Lines to scan; every element is examined.</param>
/// <param name="search">Text to look for.</param>
private int FindIndexinArray(string[] Arr, string search)
{
    int lastMatch = -1;
    int position = 0;
    foreach (string entry in Arr)
    {
        // No early exit: a later match overrides an earlier one.
        if (entry.Contains(search))
        {
            lastMatch = position;
        }
        position++;
    }
    return lastMatch;
}