From 7899f3f5095527b220e08b2d51ba520e658609dc Mon Sep 17 00:00:00 2001
From: "Luke I. Wilson"
Date: Sun, 14 May 2023 14:46:24 -0500
Subject: [PATCH] Added Series statistical functions

---
 data.go      | 377 +++++++++++++++++++++++++++++++++++++++++++++++----
 data_test.go |  67 +++++++--
 2 files changed, 404 insertions(+), 40 deletions(-)

diff --git a/data.go b/data.go
index 9347708..cc2afe6 100644
--- a/data.go
+++ b/data.go
@@ -3,31 +3,44 @@ package autotrader
 import (
 	"encoding/csv"
 	"errors"
+	"fmt"
 	"io"
+	"math"
 	"os"
 	"strconv"
 	"time"
 
 	df "github.com/rocketlaunchr/dataframe-go"
+	"golang.org/x/exp/slices"
 )
 
+// EasyIndex returns an index to the `n`-length object that allows for negative indexing. For example, EasyIndex(-1, 5) returns 4. This is similar to Python's negative indexing. The return value may be less than zero if (-i) > n.
+func EasyIndex(i, n int) int {
+	if i < 0 {
+		return n + i
+	}
+	return i
+}
+
 type Series interface {
 	Copy() Series
 	Len() int
+
+	// Statistical functions.
+	Rolling(period int) *RollingSeries
+
+	// Data access functions.
+	Value(i int) interface{}
+	Float(i int) float64
+	Int(i int) int64
+	String(i int) string
+	Time(i int) time.Time
 }
 
 type Frame interface {
 	Copy() Frame
 	Len() int
 
-	// Comparison functions.
-	Equal(other Frame) bool
-	NotEqual(other Frame) bool
-	Less(other Frame) bool
-	LessEqual(other Frame) bool
-	Greater(other Frame) bool
-	GreaterEqual(other Frame) bool
-
 	// Easy access functions.
 	Date(i int) time.Time
 	Open(i int) float64
@@ -43,27 +56,212 @@ type Frame interface {
 	Volumes() Series
 
 	// Custom data columns
+	Series(name string) Series
 	Value(column string, i int) interface{}
 	Float(column string, i int) float64
-	Int(column string, i int) int
+	Int(column string, i int) int64
 	String(column string, i int) string
 	// Time returns the value of the column at index i. The first value is at index 0. A negative value for i (-n) can be used to get n values from the latest, like Python's negative indexing. If i is out of bounds, 0 is returned.
 	Time(column string, i int) time.Time
 }
 
+// AppliedSeries is like Series, but it applies a function to each row of data before returning it.
+type AppliedSeries struct {
+	Series
+	apply func(i int, val interface{}) interface{}
+}
+
+// Value passes the wrapped Series' value at i to the apply function, together with i normalized by EasyIndex, and returns the result.
+func (s *AppliedSeries) Value(i int) interface{} {
+	return s.apply(EasyIndex(i, s.Len()), s.Series.Value(i))
+}
+
+func NewAppliedSeries(s Series, apply func(i int, val interface{}) interface{}) *AppliedSeries {
+	return &AppliedSeries{
+		Series: s,
+		apply:  apply,
+	}
+}
+
+// RollingSeries is a Series whose Value at i is the window of up to `period` values of the wrapped Series ending at i.
+type RollingSeries struct {
+	Series
+	period int
+}
+
+// Mean returns an AppliedSeries of the arithmetic mean of each window. int64 windows use integer division; an empty window yields nil.
+func (s *RollingSeries) Mean() *AppliedSeries {
+	return &AppliedSeries{
+		Series: s,
+		apply: func(_ int, v interface{}) interface{} {
+			switch v := v.(type) {
+			case []interface{}:
+				if len(v) == 0 {
+					return nil
+				}
+				switch v[0].(type) {
+				case float64:
+					var sum float64
+					for _, v := range v {
+						sum += v.(float64)
+					}
+					return sum / float64(len(v))
+				case int64:
+					var sum int64
+					for _, v := range v {
+						sum += v.(int64)
+					}
+					return sum / int64(len(v))
+				default:
+					return v[len(v)-1] // Non-numeric window: pass the latest value through unchanged.
+				}
+			default:
+				panic(fmt.Sprintf("expected a slice of values, got %T", v))
+			}
+		},
+	}
+}
+
+// EMA returns an AppliedSeries of the exponential moving average of each window, seeded with the window's first value and smoothed by 2/(period+1).
+func (s *RollingSeries) EMA() *AppliedSeries {
+	return &AppliedSeries{
+		Series: s,
+		apply: func(i int, v interface{}) interface{} {
+			switch v := v.(type) {
+			case []interface{}:
+				if len(v) == 0 {
+					return nil
+				}
+				switch v[0].(type) {
+				case float64:
+					ema := v[0].(float64)
+					for _, v := range v[1:] {
+						ema += (v.(float64) - ema) * 2 / (float64(s.period) + 1)
+					}
+					return ema
+				case int64:
+					ema := v[0].(int64)
+					for _, v := range v[1:] {
+						ema += (v.(int64) - ema) * 2 / (int64(s.period) + 1)
+					}
+					return ema
+				default: // string, time.Time
+					return v[len(v)-1] // Non-numeric window: pass the latest value through unchanged.
+				}
+			default:
+				panic(fmt.Sprintf("expected a slice of values, got %T", v))
+			}
+		},
+	}
+}
+
+// Median returns an AppliedSeries of the median of each window. NaNs sort first; an even-length window averages its two middle values.
+func (s *RollingSeries) Median() *AppliedSeries {
+	return &AppliedSeries{
+		Series: s,
+		apply: func(_ int, v interface{}) interface{} {
+			switch v := v.(type) {
+			case []interface{}:
+				if len(v) == 0 {
+					return nil
+				}
+				switch v[0].(type) {
+				case float64:
+					slices.SortFunc(v, func(a, b interface{}) bool {
+						x, y := a.(float64), b.(float64)
+						return x < y || (math.IsNaN(x) && !math.IsNaN(y))
+					})
+					if len(v)%2 == 0 {
+						return (v[len(v)/2-1].(float64) + v[len(v)/2].(float64)) / 2
+					}
+					return v[len(v)/2]
+				case int64:
+					slices.SortFunc(v, func(a, b interface{}) bool {
+						x, y := a.(int64), b.(int64)
+						return x < y
+					})
+					if len(v)%2 == 0 {
+						return (v[len(v)/2-1].(int64) + v[len(v)/2].(int64)) / 2
+					}
+					return v[len(v)/2]
+				default: // string, time.Time
+					return v[len(v)-1] // Non-numeric window: pass the latest value through unchanged.
+				}
+			default:
+				panic(fmt.Sprintf("expected a slice of values, got %T", v))
+			}
+		},
+	}
+}
+
+// StdDev returns an AppliedSeries of the population standard deviation (divides by the window length) of each window.
+func (s *RollingSeries) StdDev() *AppliedSeries {
+	return &AppliedSeries{
+		Series: s,
+		apply: func(i int, v interface{}) interface{} {
+			switch v := v.(type) {
+			case []interface{}:
+				if len(v) == 0 {
+					return nil
+				}
+				switch v[0].(type) {
+				case float64:
+					mean := s.Mean().Value(i).(float64) // Take the mean of the last period values for the current index
+					var sum float64
+					for _, v := range v {
+						sum += (v.(float64) - mean) * (v.(float64) - mean)
+					}
+					return math.Sqrt(sum / float64(len(v)))
+				case int64:
+					mean := s.Mean().Value(i).(int64)
+					var sum int64
+					for _, v := range v {
+						sum += (v.(int64) - mean) * (v.(int64) - mean)
+					}
+					return int64(math.Sqrt(float64(sum) / float64(len(v))))
+				default: // A slice of something else, just return the last value
+					return v[len(v)-1] // Non-numeric window: pass the latest value through unchanged.
+				}
+			default:
+				panic(fmt.Sprintf("expected a slice of values, got %T", v))
+			}
+		},
+	}
+}
+
+// Value returns []interface{} up to `period` long. The last item in the slice is the item at i. If i is out of bounds, an empty slice is returned.
+func (s *RollingSeries) Value(i int) interface{} {
+	i = EasyIndex(i, s.Len())
+	if i < 0 || i >= s.Len() || s.period <= 0 {
+		return []interface{}{} // Out of bounds, or a window that cannot hold any rows.
+	}
+	items := make([]interface{}, s.period)
+	k := len(items) // Fill from the back so the row at i lands in the last slot.
+	for j := i; k > 0 && j >= 0; j, k = j-1, k-1 {
+		items[k-1] = s.Series.Value(j)
+	}
+	return items[k:] // Drop the unfilled head when fewer than `period` rows precede i.
+}
+
+// DataSeries is a Series that wraps a column of data. The data can be of the following types: float64, int64, string, or time.Time.
+type DataSeries struct {
+	data df.Series
+}
+
 type DataFrame struct {
-	*df.DataFrame // DataFrame with a Date, Open, High, Low, Close, and Volume column.
+	data *df.DataFrame // DataFrame with a Date, Open, High, Low, Close, and Volume column.
 }
 
 func (o *DataFrame) Copy() *DataFrame {
-	return &DataFrame{o.DataFrame.Copy()}
+	return &DataFrame{o.data.Copy()}
 }
 
+// Len returns the number of rows in the DataFrame, or 0 if it wraps no data.
 func (o *DataFrame) Len() int {
-	if o.DataFrame == nil {
+	if o.data == nil {
 		return 0
 	}
-	return o.NRows()
+	return o.data.NRows()
 }
 
 // Date returns the value of the Date column at index i. The first value is at index 0. A negative value for i (-n) can be used to get n values from the latest, like Python's negative indexing. If i is out of bounds, 0 is returned.
@@ -102,21 +300,59 @@ func (o *DataFrame) Volume(i int) float64 {
 	return o.Float("Volume", i)
 }
 
-// Value returns the value of the column at index i. The first value is at index 0. A negative value for i (-n) can be used to get n values from the latest, like Python's negative indexing. If i is out of bounds, nil is returned.
-func (o *DataFrame) Value(column string, i int) interface{} {
-	colIdx, err := o.DataFrame.NameToColumn(column)
+// Dates returns a Series of all the dates in the DataFrame.
+func (o *DataFrame) Dates() Series {
+	return o.Series("Date")
+}
+
+// Opens returns a Series of all the open prices in the DataFrame.
+func (o *DataFrame) Opens() Series {
+	return o.Series("Open")
+}
+
+// Highs exposes the "High" column as a Series.
+func (o *DataFrame) Highs() Series {
+	return o.Series("High")
+}
+
+// Lows exposes the "Low" column as a Series.
+func (o *DataFrame) Lows() Series {
+	return o.Series("Low")
+}
+
+// Closes exposes the "Close" column as a Series.
+func (o *DataFrame) Closes() Series {
+	return o.Series("Close")
+}
+
+// Volumes exposes the "Volume" column as a Series.
+func (o *DataFrame) Volumes() Series {
+	return o.Series("Volume")
+}
+
+// Series wraps the named column in a DataSeries. It returns nil when the DataFrame wraps no data or the column is unknown.
+func (o *DataFrame) Series(name string) Series {
+	if o.data == nil {
+		return nil
+	}
+	col, err := o.data.NameToColumn(name)
 	if err != nil {
 		return nil
-	} else if o.DataFrame == nil || i >= o.Len() {
-		return 0
-	} else if i < 0 {
-		i = o.Len() - i
-		if i < 0 {
-			return 0
-		}
-		return o.Series[colIdx].Value(i)
 	}
-	return o.Series[colIdx].Value(i)
+	return &DataSeries{o.data.Series[col]}
+}
+
+// Value looks up row i of the named column. Rows count from 0 and a negative i counts back from the last row, as in Python. It returns nil when the DataFrame wraps no data, the column is unknown, or i is out of bounds.
+func (o *DataFrame) Value(column string, i int) interface{} {
+	if o.data == nil {
+		return nil
+	}
+	col, err := o.data.NameToColumn(column)
+	row := EasyIndex(i, o.Len()) // Resolve a negative index relative to the end.
+	if row < 0 || row >= o.Len() || err != nil { // Row out of range or unknown column.
+		return nil
+	}
+	return o.data.Series[col].Value(row)
 }
 
 // Float returns the value of the column at index i casted to float64. The first value is at index 0. A negative value for i (-n) can be used to get n values from the latest, like Python's negative indexing. If i is out of bounds, 0 is returned.
@@ -175,13 +411,83 @@ func (o *DataFrame) Time(column string, i int) time.Time {
 	}
 }
 
-func NewChartData(data *df.DataFrame) *DataFrame {
+// NewDataFrame wraps data in a DataFrame without copying it.
+func NewDataFrame(data *df.DataFrame) *DataFrame {
 	return &DataFrame{data}
 }
 
-type RollingWindow struct {
-	DataFrame
-	Period int
+// Copy returns a new DataSeries wrapping s.data.Copy(), or an empty DataSeries when s wraps no data.
+func (s *DataSeries) Copy() Series {
+	if s.data == nil {
+		return &DataSeries{}
+	}
+	return &DataSeries{s.data.Copy()}
+}
+
+// Len returns the number of rows in the series, or 0 when it wraps no data.
+func (s *DataSeries) Len() int {
+	if s.data == nil {
+		return 0
+	}
+	return s.data.NRows()
+}
+
+// Rolling returns a RollingSeries over s with the given window period.
+func (s *DataSeries) Rolling(period int) *RollingSeries {
+	return &RollingSeries{s, period}
+}
+
+// Value returns the value at index i, where a negative i counts back from the end. It returns nil when i is out of bounds.
+func (s *DataSeries) Value(i int) interface{} {
+	i = EasyIndex(i, s.Len()) // Allow for negative indexing.
+	if i < 0 || i >= s.Len() { // Prevent out of bounds access; Len is 0 when data is nil.
+		return nil
+	}
+	return s.data.Value(i)
+}
+
+// Float returns the value at index i if it is a float64, otherwise 0.
+func (s *DataSeries) Float(i int) float64 {
+	val := s.Value(i)
+	switch val := val.(type) {
+	case float64:
+		return val
+	default: // Also reached for a nil value.
+		return 0
+	}
+}
+
+// Int returns the value at index i if it is an int64, otherwise 0.
+func (s *DataSeries) Int(i int) int64 {
+	val := s.Value(i)
+	switch val := val.(type) {
+	case int64:
+		return val
+	default: // Also reached for a nil value.
+		return 0
+	}
+}
+
+// String returns the value at index i if it is a string, otherwise "".
+func (s *DataSeries) String(i int) string {
+	val := s.Value(i)
+	switch val := val.(type) {
+	case string:
+		return val
+	default: // Also reached for a nil value.
+		return ""
+	}
+}
+
+// Time returns the value at index i if it is a time.Time, otherwise the zero time.
+func (s *DataSeries) Time(i int) time.Time {
+	val := s.Value(i)
+	switch val := val.(type) {
+	case time.Time:
+		return val
+	default: // Also reached for a nil value.
+		return time.Time{}
+	}
 }
 
 type DataCSVLayout struct {
@@ -204,6 +510,19 @@ func ReadDataCSV(path string, layout DataCSVLayout) (*df.DataFrame, error) {
 	return ReadDataCSVFromReader(f, layout)
 }
 
+func ReadEURUSDDataCSV() (*df.DataFrame, error) {
+	return ReadDataCSV("./EUR_USD Historical Data.csv", DataCSVLayout{
+		LatestFirst: true,
+		DateFormat:  "01/02/2006",
+		Date:        "\ufeff\"Date\"",
+		Open:        "Open",
+		High:        "High",
+		Low:         "Low",
+		Close:       "Price",
+		Volume:      "Vol.",
+	})
+}
+
 func ReadDataCSVFromReader(r io.Reader, layout DataCSVLayout) (*df.DataFrame, error) {
 	data, err := ReadCSVFromReader(r, layout.DateFormat, layout.LatestFirst)
 	if err != nil {
diff --git a/data_test.go b/data_test.go
index 99dd6a4..d3baa32 100644
--- a/data_test.go
+++ b/data_test.go
@@ -1,18 +1,63 @@
 package autotrader
 
-import "testing"
+import (
+	"testing"
+	"time"
+)
+
+func newTestingDataFrame() *DataFrame {
+	_dataframe, err := ReadEURUSDDataCSV()
+	if err != nil {
+		return nil
+	}
+	return NewDataFrame(_dataframe)
+}
+
+func TestDataSeries(t *testing.T) {
+	data := newTestingDataFrame()
+	if data == nil {
+		t.Fatal("Could not create DataFrame")
+	}
+
+	dates, closes := data.Dates(), data.Closes()
+
+	if dates.Len() != 2610 {
+		t.Fatalf("Expected 2610 rows, got %d", dates.Len())
+	}
+	if closes.Len() != 2610 {
+		t.Fatalf("Expected 2610 rows, got %d", closes.Len())
+	}
+
+	sma10 := closes.Rolling(10).Mean()
+	if sma10.Len() != 2610 {
+		t.Fatalf("Expected 2610 rows, got %d", sma10.Len())
+	}
+	if sma10.Value(-1) != 1.10039 { // Latest closing price averaged over 10 periods.
+		t.Fatalf("Expected 1.10039, got %f", sma10.Value(-1))
+	}
+}
+
+func TestDataFrame(t *testing.T) {
+	data := newTestingDataFrame()
+	if data == nil {
+		t.Fatal("Could not create DataFrame")
+	}
+
+	if data.Len() != 2610 {
+		t.Fatalf("Expected 2610 rows, got %d", data.Len())
+	}
+	if data.Close(-1) != 1.0967 {
+		t.Fatalf("Expected 1.0967, got %f", data.Close(-1))
+	}
+
+	date := data.Date(2) // Get the 3rd earliest date from the Date column.
+	if date.Year() != 2013 || date.Month() != 5 || date.Day() != 13 {
+		t.Fatalf("Expected 2013-05-13, got %s", date.Format(time.DateOnly))
+	}
+}
 
 func TestReadDataCSV(t *testing.T) {
-	data, err := ReadDataCSV("./EUR_USD Historical Data.csv", DataCSVLayout{
-		LatestFirst: true,
-		DateFormat:  "01/02/2006",
-		Date:        "\ufeff\"Date\"",
-		Open:        "Open",
-		High:        "High",
-		Low:         "Low",
-		Close:       "Price",
-		Volume:      "Vol.",
-	})
+	data, err := ReadEURUSDDataCSV()
 	if err != nil {
 		t.Fatal(err)
 	}