out of memory in golang when parsing Freebase RDF

639 Views Asked by At

I'm parsing the triples of the compressed Freebase RDF dump and streaming them with the XML package in Golang. However, I'm getting an out-of-memory error.

Do I have to garbage-collect? How can I do that? How can I clear the memory after I'm done writing that triple to the XML file?

Here's my code: http://play.golang.org/p/dWvbtcs7wy

package main

import(
    "bufio"
    "flag"
    "fmt"
    "io"
    "net/url"
    "os"
    "regexp"
    "strings"
 )

 // inputFile is the path of the RDF input, registered as the -infile flag
 // with "freebase-rdf" as its default.
 var inputFile = flag.String("infile", "freebase-rdf", "Input file path")

 // filter matches strings that begin with one of the lower-case namespace
 // prefixes file:, talk:, special:, wikipedia:, wiktionary:, user: or
 // user_talk:. MustCompile is used so that a malformed pattern fails loudly
 // at start-up instead of leaving filter nil.
 var filter = regexp.MustCompile("^file:.*|^talk:.*|^special:.*|^wikipedia:.*|^wiktionary:.*|^user:.*|^user_talk:.*")
// Redirect holds a single title, mapped to an XML attribute named "title"
// by its struct tag.
// NOTE(review): nothing in the visible code references this type.
type Redirect struct {
     Title string `xml:"title,attr"`
}

// Page pairs a title with an abstract and carries XML struct tags.
// NOTE(review): nothing in the visible code references this type.
type Page struct {
    // Title maps to a child element named "title".
    Title    string `xml:"title"`
    // Abstract has an empty xml tag; presumably the default element name
    // (the field name) applies — TODO confirm.
    Abstract string `xml:""`
}

// CanonicaliseTitle normalises a title into a URL-safe key: the title is
// lower-cased, every space becomes an underscore, and the result is
// query-escaped.
func CanonicaliseTitle(title string) string {
    underscored := strings.Replace(strings.ToLower(title), " ", "_", -1)
    return url.QueryEscape(underscored)
}

 // convertFreebaseId turns an RDF term into the short form used by the rest
 // of the program.
 //
 // A term wrapped in angle brackets has the brackets removed. If what is left
 // starts with the Freebase namespace "http://rdf.freebase.com/ns", that
 // prefix is dropped and every "." in the remainder becomes "/", so
 // "<http://rdf.freebase.com/ns/m.0abc>" yields "/m/0abc". Bracketed IRIs
 // from any other namespace are returned without the brackets but otherwise
 // untouched, so their host names and paths are not rewritten. Terms that are
 // not wrapped in angle brackets (for example literals) are returned as is.
 func convertFreebaseId(uri string) string {
     if !strings.HasPrefix(uri, "<") || !strings.HasSuffix(uri, ">") {
         return uri
     }
     const freebaseNS = "http://rdf.freebase.com/ns"
     id := uri[1 : len(uri)-1]
     if !strings.HasPrefix(id, freebaseNS) {
         // Not a Freebase identifier: dots here belong to a host name or path.
         return id
     }
     return strings.Replace(id[len(freebaseNS):], ".", "/", -1)
 }

 // parseTriple splits one tab-separated line into subject, predicate and
 // object, passing each field through convertFreebaseId.
 //
 // Only the first three fields are used; anything after the third tab is
 // ignored. A line with fewer than three fields no longer panics: the missing
 // fields are returned as empty strings.
 func parseTriple(line string) (string, string, string) {
     // Splitting into at most four pieces keeps the first three fields
     // intact without allocating for the rest of the line.
     parts := strings.SplitN(line, "\t", 4)
     for len(parts) < 3 {
         parts = append(parts, "")
     }
     subject := convertFreebaseId(parts[0])
     predicate := convertFreebaseId(parts[1])
     object := convertFreebaseId(parts[2])
     return subject, predicate, object
 }

 // validRegexp matches a non-empty string made up only of ASCII letters,
 // digits, '_' and '-', whose first character is a letter or digit.
 var validRegexp = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9_-]*$`)

 // englishRegexp matches any string that contains "@en".
 var englishRegexp = regexp.MustCompile(`@en`)

 // validTitle reports whether every entry of content is acceptable as a
 // title. An entry is rejected when it does not contain "@en", is longer than
 // one byte, and contains the literal "[]". An empty or nil slice is valid.
 //
 // The rejecting branch returns false, in line with validText; with an empty
 // branch the function could never reject anything.
 // NOTE(review): the "[]" requirement is carried over from validText —
 // confirm it is the intended condition for titles.
 func validTitle(content []string) bool {
     for _, v := range content {
         if !englishRegexp.MatchString(v) && len(v) > 1 && strings.Contains(v, "[]") {
             return false
         }
     }
     return true
 }

 // validText reports whether every entry of content is acceptable as body
 // text. An entry fails when it is longer than one byte, contains the literal
 // "[]", and does not match validRegexp. An empty or nil slice is valid.
 func validText(content []string) bool {
     for i := range content {
         text := content[i]
         // Entries that are too short or lack "[]" are never rejected.
         if len(text) <= 1 || !strings.Contains(text, "[]") {
             continue
         }
         if !validRegexp.MatchString(text) {
             return false
         }
     }
     return true
 }

 // Escapers for text interpolated into the generated XML. Element content
 // needs '&', '<' and '>' escaped; attribute values additionally need '"'.
 var (
     xmlTextEscaper = strings.NewReplacer("&", "&amp;", "<", "&lt;", ">", "&gt;")
     xmlAttrEscaper = strings.NewReplacer("&", "&amp;", "<", "&lt;", ">", "&gt;", "\"", "&quot;")
 )

 // processTopic writes one <card> element for the topic id to file.
 //
 // properties maps each predicate to the list of object values collected for
 // the topic. Nothing is written unless the "/type/object/name" values pass
 // validTitle and the "/common/document/text" values pass validText.
 //
 // The title and text elements contain the whole value slice in fmt's default
 // slice notation ("[a b]"). Every property value is also emitted as a <fact>
 // element, in map iteration order, which Go leaves unspecified. All
 // interpolated data is XML-escaped so that '&', '<' and '>' in the input
 // cannot produce malformed output.
 //
 // Write errors are not reported; the function has no error result.
 func processTopic(id string, properties map[string][]string, file io.Writer) {
     names := properties["/type/object/name"]
     texts := properties["/common/document/text"]
     if !validTitle(names) || !validText(texts) {
         return
     }

     fmt.Fprintf(file, "<card>\n")
     fmt.Fprintf(file, "<title>\"%s\"</title>\n", xmlTextEscaper.Replace(fmt.Sprint(names)))
     fmt.Fprintf(file, "<image>\"%s/%s\"</image>\n", "https://usercontent.googleapis.com/freebase/v1/image", xmlTextEscaper.Replace(id))
     fmt.Fprintf(file, "<text>\"%s\"</text>\n", xmlTextEscaper.Replace(fmt.Sprint(texts)))
     fmt.Fprintf(file, "<facts>\n")
     for k, v := range properties {
         property := xmlAttrEscaper.Replace(k)
         for _, value := range v {
             fmt.Fprintf(file, "<fact property=\"%s\">%s</fact>\n", property, xmlTextEscaper.Replace(value))
         }
     }
     fmt.Fprintf(file, "</facts>\n")
     fmt.Fprintf(file, "</card>\n")
 }

 // main reads the RDF file named by -infile line by line, groups consecutive
 // triples that share a subject into one topic, and writes each topic to
 // freebase.xml through processTopic.
 //
 // Only the triples of the current topic are held in memory; the map is
 // replaced whenever the subject changes.
 func main() {
     // Without Parse the -infile flag is never read and the default always wins.
     flag.Parse()

     f, err := os.Open(*inputFile)
     if err != nil {
         fmt.Println(err)
         return
     }
     defer f.Close()

     xmlFile, err := os.Create("freebase.xml")
     if err != nil {
         fmt.Println(err)
         return
     }
     defer xmlFile.Close()

     // processTopic issues many small writes per topic, so buffer them.
     out := bufio.NewWriter(xmlFile)
     r := bufio.NewReader(f)

     currentMid := ""
     currentTopic := make(map[string][]string)

     var readErr error
     for readErr == nil {
         var line string
         // ReadString may return data together with io.EOF when the last
         // line has no trailing newline, so the line is handled before the
         // error is looked at.
         line, readErr = r.ReadString('\n')
         if strings.Count(line, "\t") < 2 {
             // Blank or malformed line: fewer than three tab-separated fields.
             continue
         }
         subject, predicate, object := parseTriple(line)
         if subject != currentMid {
             if len(currentMid) > 0 {
                 processTopic(currentMid, currentTopic, out)
             }
             currentTopic = make(map[string][]string)
             currentMid = subject
         }
         // Record the triple after any topic switch so that the first triple
         // of a new subject is kept as well.
         currentTopic[predicate] = append(currentTopic[predicate], object)
     }

     // Emit whatever topic was still being collected when reading stopped.
     if len(currentMid) > 0 {
         processTopic(currentMid, currentTopic, out)
     }
     if err := out.Flush(); err != nil {
         fmt.Println(err)
         return
     }
     if readErr != io.EOF {
         fmt.Println(readErr)
         return
     }
 }
1

There is 1 best solution below

0
On

I'm not sure that this is your problem, although reading your code it seems you're not leaking anything - but you can tune GC behavior a bit with SetGCPercent() http://golang.org/pkg/runtime/debug/#SetGCPercent

According to TFM, a collection is triggered when the ratio of freshly allocated data to live data remaining after the previous collection reaches this percentage. The default rate is 100%, meaning for programs that make lots of small allocations and hold lots of RAM, the overhead can be huge. I had an HTTP cache take up over 200% of the cache size once. Try tuning the percentage to somewhere around 10% and see if it helps.