I am trying to read data from multiple CSV files using TensorFlow's Dataset API.

The following version of the code works just fine:

import tensorflow as tf

record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]

def decode_csv(line):
   col1, col2, col3, col4, col5, col6, col7 = tf.decode_csv(line, record_defaults)
   features = tf.stack([col2, col3, col4, col5, col6])
   labels = tf.stack([col7])
   return features, labels

filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()

But I would like to make it more dynamic, because the number of columns (i.e., the number of features) may change from project to project. When I change the code as follows, however, it just doesn't work, and spending a significant number of hours on the problem has not helped either. (A sketch of generating record_defaults for a variable column count follows the code below.)

record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.]]
def decode_csv(line):
   csv_columns = tf.decode_csv(line, record_defaults)
   labels = csv_columns[-1]    # the last column is the label
   del csv_columns[-1]         # drop the label column
   del csv_columns[0]          # drop the first column because it is not a feature
   features = csv_columns
   return features, labels

filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()
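
(As an aside, if the number of feature columns really does vary between projects, record_defaults itself could be generated rather than hard-coded. A minimal sketch; num_features is a hypothetical per-project value, not something from the code above:)

num_features = 5                                              # hypothetical: set per project
record_defaults = [[""]] + [[0.0]] * num_features + [[0.0]]   # ID column, feature columns, label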

I get the following error when I run the second version above. Maybe a more experienced person will see the problem at once?

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-21-92ea8cc44da0> in <module>()
     18 filenames = tf.placeholder(tf.string, shape=[None])
     19 dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
---> 20 dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
     21 dataset5 = dataset5.shuffle(buffer_size=1000)
     22 dataset5 = dataset5.batch(7)

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in flat_map(self, map_func)
    799       Dataset: A `Dataset`.
    800     """
--> 801     return FlatMapDataset(self, map_func)
    802 
    803   def interleave(self, map_func, cycle_length, block_length=1):

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, map_func)
   1676 
   1677     self._map_func = tf_map_func
-> 1678     self._map_func.add_to_graph(ops.get_default_graph())
   1679 
   1680   def _as_variant_tensor(self):

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in add_to_graph(self, g)
    484   def add_to_graph(self, g):
    485     """Adds this function into the graph g."""
--> 486     self._create_definition_if_needed()
    487 
    488     # Adds this function into 'g'.

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed(self)
    319     """Creates the function definition if it's not created yet."""
    320     with context.graph_mode():
--> 321       self._create_definition_if_needed_impl()
    322 
    323   def _create_definition_if_needed_impl(self):

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed_impl(self)
    336       # Call func and gather the output tensors.
    337       with vs.variable_scope("", custom_getter=temp_graph.getvar):
--> 338         outputs = self._func(*inputs)
    339 
    340       # There is no way of distinguishing between a function not returning

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in tf_map_func(*args)
   1664         dataset = map_func(*nested_args)
   1665       else:
-> 1666         dataset = map_func(nested_args)
   1667 
   1668       if not isinstance(dataset, Dataset):

<ipython-input-21-92ea8cc44da0> in <lambda>(filename)
     18 filenames = tf.placeholder(tf.string, shape=[None])
     19 dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
---> 20 dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
     21 dataset5 = dataset5.shuffle(buffer_size=1000)
     22 dataset5 = dataset5.batch(7)

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in map(self, map_func, num_parallel_calls)
    784     """
    785     if num_parallel_calls is None:
--> 786       return MapDataset(self, map_func)
    787     else:
    788       return ParallelMapDataset(self, map_func, num_parallel_calls)

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, map_func)
   1587 
   1588     self._map_func = tf_map_func
-> 1589     self._map_func.add_to_graph(ops.get_default_graph())
   1590 
   1591   def _as_variant_tensor(self):

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in add_to_graph(self, g)
    484   def add_to_graph(self, g):
    485     """Adds this function into the graph g."""
--> 486     self._create_definition_if_needed()
    487 
    488     # Adds this function into 'g'.

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed(self)
    319     """Creates the function definition if it's not created yet."""
    320     with context.graph_mode():
--> 321       self._create_definition_if_needed_impl()
    322 
    323   def _create_definition_if_needed_impl(self):

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/function.py in _create_definition_if_needed_impl(self)
    336       # Call func and gather the output tensors.
    337       with vs.variable_scope("", custom_getter=temp_graph.getvar):
--> 338         outputs = self._func(*inputs)
    339 
    340       # There is no way of distinguishing between a function not returning

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in tf_map_func(*args)
   1575       self._output_classes = sparse.get_classes(ret)
   1576       self._output_shapes = nest.pack_sequence_as(
-> 1577           ret, [t.get_shape() for t in nest.flatten(ret)])
   1578       self._output_types = nest.pack_sequence_as(
   1579           ret, [t.dtype for t in nest.flatten(ret)])

~/.local/lib/python3.5/site-packages/tensorflow/python/data/ops/dataset_ops.py in <listcomp>(.0)
   1575       self._output_classes = sparse.get_classes(ret)
   1576       self._output_shapes = nest.pack_sequence_as(
-> 1577           ret, [t.get_shape() for t in nest.flatten(ret)])
   1578       self._output_types = nest.pack_sequence_as(
   1579           ret, [t.dtype for t in nest.flatten(ret)])

AttributeError: 'list' object has no attribute 'get_shape'

ADDENDUM:

The following works as well.

feature_names = ['f0','f1','f2','f3','f4','f5']
record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.]]

def decode_csv(line):
   parsed_line = tf.decode_csv(line, record_defaults)   # => list of tensors, one per column
   label =  parsed_line[-1]
   del parsed_line[-1]
   features = parsed_line
   d = dict(zip(feature_names, features)), label
   return d

filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()

But now the decode_csv function returns a dictionary of (feature_name, feature_value) pairs. Why would someone want to return a dict from this function? Doesn't that make it very difficult to vectorize calculations like forward propagation?
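
From what I can tell, the dict form is aimed at the Estimator / feature-column workflow, where the input function yields (features_dict, label) pairs and each feature column looks its values up by name. A minimal sketch of how that might be wired up, reusing the decode_csv and feature_names from the addendum above; the file names, the LinearRegressor choice, and the train() call are illustrative assumptions only:

feature_columns = [tf.feature_column.numeric_column(name)
                   for name in feature_names[1:]]    # skip 'f0', which holds the string ID here

def input_fn():
    # Estimators expect an input_fn returning a dataset of (features_dict, label) pairs,
    # which is exactly what the dict-returning decode_csv above produces.
    csv_files = ["train1.csv", "train2.csv"]          # hypothetical file names
    ds = tf.data.Dataset.from_tensor_slices(csv_files)
    ds = ds.flat_map(lambda f: tf.data.TextLineDataset(f).skip(1).map(decode_csv))
    return ds.shuffle(1000).batch(7)

estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns)
# estimator.train(input_fn=input_fn, steps=100)       # illustrative only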

1 Answer

Solved. Below is the working version (I am not copying the entire thing, to save space). In the CSV file, the first column is not a feature, just a training-example ID, and the last column is the label. Stacking the features with the tf.stack(...) function solved the problem.

feature_names = ['f1','f2','f3','f4','f5']
record_defaults = [[""], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]


def decode_csv(line):
   parsed_line = tf.decode_csv(line, record_defaults)
   label =  parsed_line[-1]
   del parsed_line[-1]
   del parsed_line[0]
   features = tf.stack(parsed_line)    # ADDED LINE
   d = features, label
   return d

filenames = tf.placeholder(tf.string, shape=[None])
dataset5 = tf.data.Dataset.from_tensor_slices(filenames)
dataset5 = dataset5.flat_map(lambda filename: tf.data.TextLineDataset(filename).skip(1).map(decode_csv))
dataset5 = dataset5.shuffle(buffer_size=1000)
dataset5 = dataset5.batch(7)
iterator5 = dataset5.make_initializable_iterator()
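
For completeness, a minimal usage sketch (the CSV file names fed into the placeholder are hypothetical), showing how the iterator is initialized and then consumed:

next_features, next_labels = iterator5.get_next()

with tf.Session() as sess:
    sess.run(iterator5.initializer,
             feed_dict={filenames: ["train1.csv", "train2.csv"]})   # hypothetical paths
    batch_features, batch_labels = sess.run([next_features, next_labels])
    print(batch_features.shape, batch_labels.shape)                 # e.g. (7, 5) and (7,)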