Statistics/OpenStat/Research projects/Formatting rules and function for importing into R
What is the easiest and most consistent way of defining time-dependent datasets and importing them into R?
Data structures such as the MAMEU example can be imported, e.g., with the following R script (click "edit" to preserve the formatting):
library(XML)
### Address of the wiki page, requested with action=edit so that the page is
### delivered as an edit form (the text is read from that form's textarea below).
wikiurl <- 'http://en.wikiversity.org/w/index.php?title=Topic:Statistics:OpenStat:MAMEU&action=edit'
### Download the page as UTF-8 text lines and parse the HTML into an R tree
### (asText=TRUE: the argument is the document itself, not a file name).
page <- htmlTreeParse(
readLines(
url(wikiurl),encoding='UTF-8'
),
asText=TRUE
)
### Walk a fixed path through the parsed tree: html > body > div > div > div,
### then the second of that node's 'div' children, and take its children.
### NOTE(review): this path is hard-coded to the page layout at the time of
### writing; if the wiki skin changes, `part` will not contain the form -- verify.
part <- (page$children$html$children$body$children$div$children$div$children$div['div',all=TRUE][2]$div$children)
### Text content of the textarea inside the form found above.
textfeld <- part$form$children$textarea$children$text$value
### Split the page text at every '==' (the marker around main titles).
### Afterwards textfeld[1] is the text before the first title, and titles and
### their bodies alternate in the following elements.
textfeld <- as.character(textfeld)
if (length(textfeld) == 0) {
  stop('no page text found: the textarea could not be extracted', call. = FALSE)
}
textfeld <- strsplit(textfeld, '==')[[1]]

### Dataset name: first piece without '=', line breaks, the 'Data ' prefix
### and surrounding whitespace.
dataname <- gsub('=', '', as.character(textfeld[1]))
dataname <- gsub('\n', '', dataname)
dataname <- gsub('Data ', '', dataname)
dataname <- gsub('^[[:space:]]+|[[:space:]]+$', '', dataname)
### The name becomes the name of an R variable, and later steps of this script
### paste it into code. Refuse anything that is not a plain syntactic name so
### that text from the (publicly editable) page can never run as R code.
if (is.na(dataname) || !identical(make.names(dataname), dataname)) {
  stop('dataset name parsed from the page is not a valid R name: ', dataname,
       call. = FALSE)
}
dataname
description <- textfeld[3]
description

### Make the data frame (one row per year, 1 to 2100) and attach the
### description as its comment. The description is stored verbatim via
### comment<-, so quotes or backslashes in it cannot break anything, and
### assign() creates the variable named by `dataname` without eval(parse()).
assign(dataname, local({
  frame <- data.frame(year = 1:2100)
  comment(frame) <- description
  frame
}))

### description of the structure
variablestructure <- textfeld[5]
### references list
references <- textfeld[7]

### The following variables: pieces 8, 10, 12, ... are the variable headings,
### each followed by its data piece. With 7 or fewer pieces there are no
### variables; the loop index is then empty instead of failing inside seq().
counter <- if (length(textfeld) > 7) {
  seq(from = 8, to = length(textfeld), by = 2)
} else {
  warning('the page contains no variable sections', call. = FALSE)
  integer(0)
}
### For each variable: textfeld[count] is the heading and textfeld[count + 1]
### the block of data rows that belongs to it.
### The data frame is stored in the variable whose name is held in `dataname`;
### it is read with get() and written with assign(), so text scraped from the
### wiki page is never executed as R code.
framename <- gsub('^[[:space:]]+|[[:space:]]+$', '', dataname)
for (count in counter) {
  ### Column name from the heading: drop the leading '=', replace blanks and
  ### '/' by '_', remove parentheses.
  variablename <- gsub('^ *= *', '', textfeld[count])
  variablename <- gsub('[ /]', '_', variablename)
  variablename <- gsub('[()]', '', variablename)
  print(variablename)
  if (is.na(variablename) || !nzchar(variablename)) {
    warning('skipping piece ', count, ': empty variable name', call. = FALSE)
    next
  }

  ### One data row per line. Drop the lone '=' line and lines that are empty
  ### or consist only of dashes. A logical mask is used on purpose: negative
  ### indexing with an empty index (x[-integer(0)]) would drop every row.
  vardatalist <- unlist(strsplit(textfeld[count + 1], '\n'))
  keep <- !is.na(vardatalist) &
    !grepl('^=$', vardatalist) &
    !grepl('^-*$', vardatalist)
  vardatalist <- vardatalist[keep]

  ### Fields of a row are separated by '|'. Going by the variable names the
  ### order is: year | value | source | comment | comment (TODO confirm
  ### against the formatting rules of the wiki page).
  varlist <- strsplit(vardatalist, '\\|')
  varrefyear <- as.numeric(gsub(' *', '', vapply(varlist, '[', character(1), 1)))
  vardata <- as.numeric(gsub(' *', '', vapply(varlist, '[', character(1), 2)))
  ### Source and comment are extracted but not stored in the data frame.
  varsource <- as.list(vapply(varlist, '[', character(1), 3))
  varcomment <- paste(vapply(varlist, '[', character(1), 4),
                      vapply(varlist, '[', character(1), 5))

  ### Write the values into the rows of the matching years. [[ ]] is used
  ### instead of $ because $ partially matches column names and would seed a
  ### new column from an existing one whose name merely starts the same way.
  dataset <- get(framename)
  column <- dataset[[variablename]]
  if (is.null(column)) {
    column <- rep(NA_real_, nrow(dataset))
  }
  ### Rows without a usable or known year are skipped; if a year occurs more
  ### than once the last value wins, as with one assignment per row.
  rows <- match(varrefyear, dataset$year)
  found <- !is.na(rows)
  column[rows[found]] <- vardata[found]
  dataset[[variablename]] <- column
  assign(framename, dataset)
}
### Show the finished data frame. It is looked up by name with get() rather
### than eval(parse()), so the name taken from the wiki page is treated as a
### plain variable name and never evaluated as code.
print(get(gsub('^[[:space:]]+|[[:space:]]+$', '', dataname)))