0
我只是從 Solr 節點讀取記錄,代碼只讀取給定日期範圍內的記錄。我已確認讀取 50K 條記錄時可以正常運行,但嘗試讀取 100K 條記錄時,就出現了「GC overhead limit exceeded」(超出 GC 開銷限制)錯誤。如何在 Scala 代碼中解決這個問題?
我的 Scala 代碼如下:
/**
 * Runs one paged query against Solr and returns the hit count together with the page.
 *
 * @param core        passed to the `SolrQuery` constructor; NOTE(review): that argument looks
 *                    like an initial query string which `setQuery` replaces right away — confirm
 * @param selectQuery the query string actually set on the request
 * @param server      client used to execute the query
 * @param pageNum     not used by this method; kept so existing callers keep compiling
 * @param pageStart   value handed to `setStart`
 * @param pageSize    value handed to `setRows`
 * @return `(numFound, documents)` taken from the response's result list
 */
def querySolr(core: String, selectQuery: String, server: SolrClient,
    pageNum: Int, pageStart: Int, pageSize: Int): (Long, SolrDocumentList) = {
  val request = new SolrQuery(core)
    .setQuery(selectQuery)
    .setStart(pageStart)
    .setRows(pageSize)
  val documents: SolrDocumentList = server.query(request).getResults
  (documents.getNumFound, documents)
}
/**
 * Computes the 1-based item range covered by a page and the total number of pages.
 *
 * @param page       1-based page number
 * @param pageSize   number of items per page
 * @param totalItems total number of items across all pages
 * @return `(from, to, totalPages)` where `from` is the 1-based index of the first item on the
 *         page, `to` is the 1-based index of the last item (capped at `totalItems`), and
 *         `totalPages` is `totalItems / pageSize` rounded up
 */
def pageCalc(page: Int, pageSize: Int, totalItems: Long): (Int, Long, Long) = {
  val firstItem = (page - 1) * pageSize + 1
  val lastItem = math.min(totalItems, (firstItem + pageSize - 1).toLong)
  val fullPages = totalItems / pageSize
  // A partially filled trailing page still counts as a page.
  val pageCount = if (totalItems % pageSize > 0) fullPages + 1 else fullPages
  (firstItem, lastItem, pageCount)
}
/**
 * Reads every document matching `query` from a Solr core over HTTPS, one page at a time.
 *
 * Pages of 1000 rows are requested through `querySolr` with a 0-based start offset until the
 * offset reaches the hit count reported by the latest response. Documents are gathered in a
 * list builder, so each page is appended without re-copying what was already collected.
 *
 * The whole result set is still held in memory at once. For result sets of millions of
 * documents the caller should consider processing pages as they arrive (or Solr cursor-based
 * deep paging) instead of materialising one list.
 *
 * @param core     name of the Solr core; used in the URL and forwarded to `querySolr`
 * @param solrhost host name of the Solr node (port 8983 is assumed)
 * @param userName user name for the credentials provider
 * @param password password for the credentials provider
 * @param query    Solr query string selecting the documents to read
 * @return all matching documents, in the order the pages were returned
 */
def getRecordsFromSolr(core: String, solrhost: String, userName: String, password: String,
    query: String): List[SolrDocument] = {
  import scala.collection.JavaConverters._

  val solrPort = 8983
  val url = s"https://$solrhost:$solrPort/solr/$core"

  // NOTE(review): trusting self-signed certificates and allowing every host name disables
  // server identity checks; acceptable only on a trusted network.
  val builder: SSLContextBuilder = new SSLContextBuilder()
  builder.loadTrustMaterial(null, new TrustSelfSignedStrategy())
  val sslsf: SSLConnectionSocketFactory = new SSLConnectionSocketFactory(
    builder.build(), SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER
  )

  val credsProvider: CredentialsProvider = new BasicCredentialsProvider()
  credsProvider.setCredentials(
    new AuthScope(solrhost, solrPort),
    new UsernamePasswordCredentials(userName, password))

  val httpclient: CloseableHttpClient = HttpClients.custom()
    .setSSLSocketFactory(sslsf)
    .setDefaultCredentialsProvider(credsProvider)
    .build()
  val server: SolrClient = new HttpSolrClient(url, httpclient)

  try {
    logger.info("solr connection completed")

    val pageSize = 1000
    val collected = List.newBuilder[SolrDocument]
    var pageNum = 0
    var offset = 0
    // The real hit count is unknown until the first response arrives, so start with a value
    // that guarantees at least one request.
    var total = Long.MaxValue

    while (offset < total) {
      pageNum += 1
      // Solr's start parameter is 0-based, so the running offset is passed directly.
      val (found, docs) = querySolr(core, query, server, pageNum, offset, pageSize)
      total = found
      collected ++= docs.asScala
      offset += pageSize
    }

    collected.result()
  } finally {
    // Release the connection pool even if a query fails; the externally supplied HTTP client
    // is closed explicitly as well as the Solr client.
    try server.close() finally httpclient.close()
  }
}
java.lang.OutOfMemoryError: GC overhead limit exceeded
如何避免這種內存不足的問題?我已嘗試爲每個核心分配 8GB 內存,而該表包含數百萬條記錄。
讀取 60K 條記錄時,我還遇到了下面的錯誤(在讀取的 Solr 響應過大時出現):
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Serialized task 0:0 was 18311053 bytes, which exceeds max allowed: spark.akka.frameSize (10485760 bytes) - reserved (204800 bytes). Consider increasing spark.akka.frameSize or using broadcast variables for large values.