GTG Lightning Talk

 

Summit Suen

GO
Data
Science

implementation of DataFrame

It is the thing that makes R and Python so important in the field of Data Science.

什麼

是DataFrame

[      ]

 

<int>

<bool>

<float>

<string>

其實

就是

表格啦

github.com/kniren/gota

I/O

df := dataframe.New(
    series.New([]string{"b", "a"}, series.String, "COL.1"),
    series.New([]int{1, 2}, series.Int, "COL.2"),
    series.New([]float64{3.0, 4.0}, series.Float, "COL.3"),
)

I/O

df := dataframe.LoadRecords(
    [][]string{
        []string{"A", "B", "C", "D"},
        []string{"a", "4", "5.1", "true"},
        []string{"k", "5", "7.0", "true"},
        []string{"k", "4", "6.0", "true"},
        []string{"a", "2", "7.1", "false"},
    },
)
fmt.Println(df)

[4x4] DataFrame

    A        B     C        D
 0: a        4     5.100000 true
 1: k        5     7.000000 true
 2: k        4     6.000000 true
 3: a        2     7.100000 false
    <string> <int> <float>  <bool>

I/O

jsonStr := `[{"COL.2":1,"COL.3":3},
             {"COL.1":5,"COL.2":2,"COL.3":2},
             {"COL.1":6,"COL.2":3,"COL.3":1}]`
     df := dataframe.ReadJSON(strings.NewReader(jsonStr))

fmt.Println(df)

[3x3] DataFrame

    COL.1 COL.2 COL.3
 0: NaN   1     3
 1: 5     2     2
 2: 6     3     1
    <int> <int> <int>

Subset

// Row selection
sub := df.Subset([]int{0, 2})

[2x4] DataFrame

    A        B     C        D
 0: a        4     5.100000 true
 1: k        4     6.000000 true
    <string> <int> <float>  <bool>

// Column selection
sel := df.Select([]string{"A", "C"})

[4x2] DataFrame

    A        C       
 0: a        5.100000
 1: k        7.000000
 2: k        6.000000
 3: a        7.100000
    <string> <float> 

Filter

fil := df.Filter(
    dataframe.F{"A", series.Eq, "a"},
    dataframe.F{"B", series.Greater, 4},
) 

[3x4] DataFrame
    A        B     C        D
 0: a        4     5.100000 true
 1: k        5     7.000000 true
 2: a        2     7.100000 false
    <string> <int> <float>  <bool>

fil2 := fil.Filter(
    dataframe.F{"D", series.Eq, true},
)

[2x4] DataFrame
    A        B     C        D
 0: a        4     5.100000 true
 1: k        5     7.000000 true
    <string> <int> <float>  <bool>

Apply

https://github.com/kniren/gota/issues/13

Benchmark

https://github.com/kniren/gota/issues/16

// storing the elements of a Series from []Element to Element

type intElement struct {
    e *int
}

type intElement struct {
    e     int
    nan bool
}

Benchmark

https://github.com/kniren/gota/issues/16

And⋯⋯?

  • Mutate

  • Arrange

  • Join

  • Chaining

  • Split/Group (developing)

  • Combine (developing)

Thanks
for
listening :)

GTG Lightning Talk

By Summit Suen

GTG Lightning Talk

  • 1,087