3

I have a csv file:

Index,X1,X2,X3,X4,X5,Y
1,-1.608052,-0.377992,1.204209,1.313808,1.218265,1
2,0.393766,0.630685,-1.222062,0.090558,0.015893,0
3,-0.466243,0.276972,2.519047,0.673745,0.16729,1
4,1.47121,-0.046791,-0.303291,-0.365437,1.989287,0
5,-1.672906,1.25588,-0.355706,0.123143,-2.241941,1

I want to create a classification system program and the Data is in the second row. I'm trying to get the data from the second row. I tried with next(list) like this:

def load_DataTrain(filename):
    try:
        with open(filename, newline='') as iFile:
            return list(reader(iFile, delimiter=','))
            next(list)
    except FileNotFoundError as e:
        raise e

But it doesn't work and i get an error because the program read from the first row. I didnt use pandas or csv.reader to read my csv. This is the code that i get from Divyesh GitHub:

from csv import reader
from sys import exit
from math import sqrt
from operator import itemgetter

def load_DataTrain(filename):
    try:
        with open(filename) as iFile:
            return list(reader(iFile, delimiter=','))
            next(list)
    except FileNotFoundError as e:
        raise e

def convert_to_float(DataTrain, mode):
    new_set = []
    try:
        if mode == 'training':
            for data in DataTrain:
                new_set.append([float(x) for x in data[:len(data)-1]] + [data[len(data)-1]])

        elif mode == 'test':
            for data in DataTrain:
                new_set.append([float(x) for x in data])

        else:
            print('Invalid mode, program will exit.')
            exit()

        return new_set

    except ValueError as v:
        print(v)
        print('Invalid data set format, program will exit.')
        exit()


def get_classes(training_set):
    return list(set([c[-1] for c in training_set]))


def find_neighbors(distances, k):
    return distances[0:k]


def find_response(neighbors, classes):
    votes = [0] * len(classes)

    for instance in neighbors:
        for ctr, c in enumerate(classes):
            if instance[-2] == c:
                votes[ctr] += 1

    return max(enumerate(votes), key=itemgetter(1))


def knn(training_set, test_set, k):
    distances = []
    dist = 0
    limit = len(training_set[0]) - 1

    # generate response classes from training data
    classes = get_classes(training_set)

    try:
        for test_instance in test_set:
            for row in training_set:
                for x, y in zip(row[:limit], test_instance):
                    dist += (x-y) * (x-y)
                distances.append(row + [sqrt(dist)])
                dist = 0

            distances.sort(key=itemgetter(len(distances[0])-1))

            # find k nearest neighbors
            neighbors = find_neighbors(distances, k)

            # get the class with maximum votes
            index, value = find_response(neighbors, classes)

            # Display prediction
            print('The predicted class for sample ' + str(test_instance) + ' is : ' + classes[index])
            print('Number of votes : ' + str(value) + ' out of ' + str(k))

            # empty the distance list
            distances.clear()

    except Exception as e:
        print(e)


def main():
    try:
        # get value of k
        k = int(input('Enter the value of k : '))

        # load the training and test data set
        training_file = input('Enter name of training data file : ')
        test_file = input('Enter name of test data file : ')
        training_set = convert_to_float(load_DataTrain(training_file), 'training')
        test_set = convert_to_float(load_DataTrain(test_file), 'test')

        if not training_set:
            print('Empty training set')

        elif not test_set:
            print('Empty test set')

        elif k > len(training_set):
            print('Expected number of neighbors is higher than number of training data instances')

        else:
            knn(training_set, test_set, k)

    except ValueError as v:
        print(v)

    except FileNotFoundError:
        print('File not found')


if __name__ == '__main__':
    main()

And the result is:

could not convert string to float: 'Index'

What should i supposed to do to read from the second row in csv file?

1
  • You can also use pandas library. It will a lot easier working with CSV Commented Dec 1, 2018 at 14:53

4 Answers 4

3

Minor change in your function.

If you want to return only 2nd row then you can replace [1:] to [1] in the below code.

from csv import reader
def load_DataTrain(filename):
    try:
        with open(filename, newline='') as iris:
            # returning from 2nd row
            return list(reader(iris, delimiter=','))[1:]
    except FileNotFoundError as e:
        raise e
load_DataTrain("file.csv")

Output:

[['1', '-1.608052', '-0.377992', '1.204209', '1.313808', '1.218265', '1'],
 ['2', '0.393766', '0.630685', '-1.222062', '0.090558', '0.015893', '0'],
 ['3', '-0.466243', '0.276972', '2.519047', '0.673745', '0.16729', '1'],
 ['4', '1.47121', '-0.046791', '-0.303291', '-0.365437', '1.989287', '0'],
 ['5', '-1.672906', '1.25588', '-0.355706', '0.123143', '-2.241941', '1']]

An alternative using pandas

Change df.values.tolist() with df.iloc[0].values.tolist() for returning only 2nd row.

import pandas as pd
df = pd.read_csv("dummy.csv")
pprint(df.values.tolist())

Output:

[[1.0, -1.608052, -0.377992, 1.204209, 1.313808, 1.218265, 1.0],
 [2.0, 0.393766, 0.630685, -1.222062, 0.090558, 0.015893, 0.0],
 [3.0,
  -0.466243,
  0.276972,
  2.519047,
  0.6737449999999999,
  0.16729000000000002,
  1.0],
 [4.0,
  1.4712100000000001,
  -0.046791,
  -0.303291,
  -0.365437,
  1.9892869999999998,
  0.0],
 [5.0,
  -1.6729060000000002,
  1.2558799999999999,
  -0.355706,
  0.123143,
  -2.241941,
  1.0]]
Sign up to request clarification or add additional context in comments.

Comments

3
def returnSecondRow(delimit):
    i=0
    with open(filename) as fh:
        reader = csv.reader(fh, delimiter=delimit)
        for row in reader:
            if i==1:
                # do nothing
                return row
            i += 1

1 Comment

moreover you can just break out of the look after 2nd row. because the data is in second row.
3

Since you‘re using the iris dataset in your example, i guess you‘re venturing into machine learning? If that‘s the case, I think it would be wiser to use pandas to read and process your .csv file.

import pandas as pd

df = pd.read_csv(filename)
row_two = df.iloc[1]

Comments

1

Using base Python (no imports)

f = open("Data.csv","r")
head = f.readline()
print(head)
for line in f:
    line = line.strip('\n')
    line = line.split(",")
    line =[float(i) for i in line]
    line[0] = int(line[0])
    line[-1] = int(line[-1])
    print(line, "and do something")

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.