import workerpool from 'workerpool';
import connect from './db/connection';

import * as Constants from './shared/constants';
import * as Utils from './shared/utils';
import * as Config from './config';

import * as ReferrersController from './controllers/referrers';
import * as CrawlOutputsController from './controllers/crawl-outputs';
import * as SearchEndpointsController from './controllers/search-endpoints';
import * as AuditController from './controllers/audits';
import * as MananagerController from './controllers/manager';
import * as PayloadsController from './controllers/payloads';
import * as WorkItemsController from './controllers/workitems';

import { IPayload } from './interfaces/core/payload.interface';
import { IReferrerAudit } from './interfaces/referrers/audit.interface';
import { IDomainReferrer } from './interfaces/referrers/domain.interface';
import { IReferrerCrawlOutput } from './interfaces/referrers/referrer-crawl-output.interface';
import { IWorkerOutput } from './interfaces/referrers/worker-output.interface';
import { HtmlOutput } from 'referrers/html-output';
import { ISearchEndpoint } from './interfaces/referrers/search-endpoint.interface';
import { ClickSearchElement } from 'referrers/click-search-element';

// Scheduler settings, read once from the crawler configuration.
const { batchSize, maxWorkers, loopInterval } = Config.Crawler;

// MongoDB connection string built from the configured host, port and database name.
const db = `mongodb://${Config.Database.db_host}:${Config.Database.db_port}/${Config.Database.db_name}`;

// Open the database connection; the call is not awaited here.
connect(db);

// Worker pool running `worker.js` from this file's directory, with 1 to `maxWorkers` workers.
const pool = workerpool.pool(`${__dirname}/worker.js`, { minWorkers: 1, maxWorkers });

// Domains returned by getWork that still wait to be handed to a worker.
const workTobeDone: string[] = [];
// Domains handed to a worker since the last refill; emptied on every refill.
const allWorkDoneInThisPool: string[] = [];

// Domains whose crawl has been started and not yet reported as finished.
let ongoingWork: string[] = [];
// Handle of the scheduler interval; falsy while the loop is stopped.
let loopIval: any;
// UTC timestamp written each time the queue is refilled.
let crawlerBatchStartTime: string;


/**
 * Fetches the next batch of referrer domains and marks them as queued.
 *
 * Returns an empty list when no manager record is found, when the manager's
 * `doWork` flag is 0, or when the referrers controller returns no domains.
 * Otherwise every returned domain is updated to the QUEUED crawl status
 * (all updates run concurrently) before the domain names are returned.
 *
 * @param batchSize batch size forwarded to `ReferrersController.getDomainsBatch` (defaults to 0)
 * @returns the domain names of the fetched referrers, in the order received
 * @throws any error raised by the manager or referrers controllers
 */
const getWork = async (batchSize: number = 0) : Promise<Array<string>> => {

    const manager = await MananagerController.getManagerWhere({});
    if (!manager || manager.doWork === 0) {
        return [];
    }

    // NOTE(review): the filter reads CVE_URL_STATUS.NOT_SET while the update below
    // writes a REFERRERS_CRAWLER.CRAWL_STATUS value — confirm both constants are
    // meant to describe the same status field.
    const pending = await ReferrersController.getDomainsBatch(
        { status: Constants.CVE_URL_STATUS.NOT_SET },
        batchSize
    );
    if (!pending || !pending.length) {
        return [];
    }

    const queueUpdates = pending.map(item =>
        ReferrersController.updateOne(
            { domain: item.domain },
            { status: Constants.REFERRERS_CRAWLER.CRAWL_STATUS.QUEUED }
        )
    );
    await Promise.all(queueUpdates);

    return pending.map(item => item.domain);
};

/**
 * Takes the next domain off the end of `workTobeDone`.
 *
 * @returns the removed entry when it is a string; `undefined` when the queue
 *          is empty or the removed entry is not a string
 */
const nexWork = () : string => {
    const candidate = workTobeDone.pop();
    return typeof candidate === 'string' ? candidate : undefined;
};

/**
 * One scheduler tick: hands queued domains to free workers, or refills the
 * queue when it is empty.
 *
 * Dispatch branch (fewer than `maxWorkers` busy and `workTobeDone` not empty):
 * up to `maxWorkers - busyWorkers` domains are taken from the queue. For each
 * one a work item is recorded, the pool's `doWork` method is executed with a
 * timeout of `Config.Crawler.workerTimeout`, the worker output is passed to
 * `handleCrawlResponse`, and the resulting status is persisted. Any failure
 * (worker error, missing output, timeout) is audited against the domain and
 * the domain is removed from `ongoingWork`.
 *
 * Refill branch (`workTobeDone` empty and the interval running): the interval
 * is stopped, `getWork(batchSize)` is called, the domains not already present
 * in `ongoingWork` are queued, and the loop is restarted through `startLoop`.
 */
const crawlerLoop = async () => {

    const stats = pool.stats();

    console.log(`current jobs stats:\n #idleWorkers:${stats.idleWorkers}, #busyWorkers:${stats.busyWorkers}, #workTobeDone:${workTobeDone.length}`);

    if(stats.busyWorkers < maxWorkers && workTobeDone.length > 0)
    {
        const newWorkersBatch = Math.min(maxWorkers - stats.busyWorkers, workTobeDone.length);

        console.log(`launching ${newWorkersBatch} new workers.`);

        for(let i = 0; i < newWorkersBatch; i++)
        {
            const nextWorkItem = nexWork();

            if(!nextWorkItem)
                continue;

            console.log(`starting work for: ${nextWorkItem}`);
            ongoingWork.push(nextWorkItem);
            allWorkDoneInThisPool.push(nextWorkItem);
            await WorkItemsController.create({domain: nextWorkItem, runId: Config.ReferrersCrawler.crawlRunId, start: new Date().toUTCString(), end: null });

            pool.exec('doWork', [nextWorkItem])
            .then(async function (workerData: IWorkerOutput) {

                // A worker may resolve with nothing: fall back to the queued domain
                // instead of dereferencing an undefined result.
                const workedDomain = (workerData && workerData.domain) || nextWorkItem;

                await WorkItemsController.updateLastWorkItem({domain: workedDomain}, {end: new Date().toUTCString()});

                // Throw a real Error so the handler below can record message and stack.
                if(!workerData || workerData.error)
                    throw new Error(`error working domain: ${workedDomain}`);

                return handleCrawlResponse(workerData);
            })
            .catch(async function (err) {
                console.error(`uncaught error: ${err}`);

                await persistAudit({
                    domain: nextWorkItem,
                    type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.CRAWLER_MANAGER_ERROR,
                    context: Constants.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.WORKERS_POOL_EXCEPTION,
                    metadata: JSON.stringify({message: err && err.message, stack: err && err.stack })
                } as IReferrerAudit);

                // The crawl is over even though it failed: stop tracking the domain,
                // otherwise it would stay in ongoingWork for the life of the process.
                ongoingWork = await Utils.removeFromArrayListAsync(nextWorkItem, ongoingWork);
            })
            .then(async function (result: any) {
                // `result` is undefined when the failure handler above ran.
                if(result && result.domain && result.status) {
                    await persistCrawlStatus(result.domain, result.status);
                    ongoingWork = await Utils.removeFromArrayListAsync(result.domain, ongoingWork);
                    console.log(`work finished for domain: ${result.domain}`);
                }
            })
            .timeout(Config.Crawler.workerTimeout)
            .catch(function (err) {
                // Last line of defence: a failure while persisting the final status
                // must not surface as an unhandled rejection.
                console.error(`error finalizing work for domain ${nextWorkItem}: ${err}`);
            });
        }
    }
    else
    if(workTobeDone.length === 0 && loopIval)
    {
        // Stop ticking while the refill is in flight. The handle must be passed to
        // clearInterval; merely resetting the variable leaves the old timer alive
        // and startLoop below would stack a second one on top of it.
        clearInterval(loopIval);
        loopIval = undefined;

        try {

            console.log(`no work left to be done, getting more work...`);

            const moreWork = await getWork(batchSize);

            if(moreWork && moreWork.length > 0)
            {
                // Skip domains that are still being crawled.
                const workDelta = Utils.differenceWith(moreWork, ongoingWork);
                workTobeDone.push.apply(workTobeDone, workDelta);
            }

            allWorkDoneInThisPool.length = 0;

            crawlerBatchStartTime = new Date().toUTCString();

        } catch (err) {
            console.error(`error getting more work: ${err}`);

            await persistAudit({
                type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.GETTING_MORE_WORK_EXCEPTION,
                runId: Config.ReferrersCrawler.crawlRunId,
                metadata: JSON.stringify({message: err && err.message, stack: err && err.stack })
            } as IReferrerAudit);
        } finally {

            //restart loop
            startLoop(loopInterval, crawlerLoop);
        }

    }
}

/**
 * Starts the periodic scheduler unless one is already running.
 *
 * When `loopIval` is falsy, `loop` is invoked once immediately (before the
 * interval handle is stored) and then scheduled every `delay` milliseconds;
 * the interval handle is kept in `loopIval`. When `loopIval` is already set
 * the call does nothing.
 *
 * @param delay interval between invocations, in milliseconds
 * @param loop  callback to run on every tick
 */
const startLoop = (delay: number, loop: any) : void => {
    const alreadyRunning = Boolean(loopIval);
    if (alreadyRunning) {
        return;
    }

    loop();
    loopIval = setInterval(loop, delay);
};

/**
 * Stores an audit record, stamping it with the current crawl run id.
 *
 * The `audit` object passed in is mutated: its `runId` is overwritten with
 * `Config.ReferrersCrawler.crawlRunId`. If storing it fails, a reduced
 * PERSIST_EXCEPTION record (type, run id, domain and the failure reason) is
 * stored instead. If that fallback fails as well the error is logged; the
 * returned promise never rejects, so callers can await it without guarding.
 *
 * @param audit the audit record to persist
 */
const persistAudit = async (audit: IReferrerAudit) => {

    try {
        audit.runId = Config.ReferrersCrawler.crawlRunId;
        await AuditController.create(audit);
    } catch (error) {
        try {
            const reason = error instanceof Error ? error.message : String(error);

            // Await the fallback too: left floating, a second failure would
            // surface as an unhandled promise rejection.
            await AuditController.create({
                type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.PERSIST_EXCEPTION,
                runId: Config.ReferrersCrawler.crawlRunId,
                domain: audit ? audit.domain : undefined,
                metadata: JSON.stringify({message: reason})
            } as IReferrerAudit);
        } catch (fallbackError) {
            console.error(`unable to persist audit record: ${fallbackError}`);
        }
    }
}

/**
 * Redacts data URLs and large payloads of intercepted requests/responses.
 *
 * Every entry of `dataInterceptions` is modified in place:
 *  - a `url` recognised by `Utils.isDataUrl` is replaced with the result of
 *    `Utils.redactDataURLWithHash`;
 *  - a truthy `payload` whose size (`Utils.getKBytes`) is at least
 *    `kbThreshold` is stored elsewhere and replaced by `hash:<hash>`, with
 *    `payload_redacted` set to true. Payloads larger than
 *    `Config.ReferrersCrawler.largePayloadsSize` are written to a JSON file
 *    under `largePayloadsPath`; the others go through `PayloadsController`.
 *
 * A failure on one entry does not stop the others: it is collected and
 * persisted as an audit record once the loop is over.
 *
 * @param domain            domain the interceptions belong to (used in audits)
 * @param dataInterceptions entries to redact; returned as-is when empty or missing
 * @param kbThreshold       minimum payload size, in the unit returned by `Utils.getKBytes`, that triggers redaction
 * @returns the same `dataInterceptions` reference that was passed in
 */
const redactPayloads = async (domain: string, dataInterceptions: Array<any>, kbThreshold: number) => {

    const failedItems = [];

    // Redacts one interception in place; throws when any step fails.
    const redactEntry = async (entry: any) => {

        if (Utils.isDataUrl(entry.url)) {
            entry.url = Utils.redactDataURLWithHash(entry.url);
        }

        if (!entry.payload) {
            return;
        }

        const sizeInKb = Utils.getKBytes(entry.payload);
        if (!(sizeInKb >= kbThreshold)) {
            return;
        }

        const digest = Utils.generateHash(`${entry.payload}`);
        const stored = {'hash': digest, 'data': entry.payload} as IPayload;

        if (sizeInKb > Config.ReferrersCrawler.largePayloadsSize) {
            await Utils.writeToFileAsync(`${Config.ReferrersCrawler.largePayloadsPath}/${digest}.json`, stored);
        } else {
            await PayloadsController.create(stored);
        }

        // Only swap the payload once it has been stored successfully.
        entry.payload = `hash:${digest}`;
        entry.payload_redacted = true;
    };

    try {

        if (!dataInterceptions || !dataInterceptions.length) {
            return dataInterceptions;
        }

        for (const entry of dataInterceptions) {
            try {
                await redactEntry(entry);
            } catch (e) {
                failedItems.push({
                    domain,
                    type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.CRAWLER_MANAGER_ERROR,
                    context: Constants.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.REDACTING_PAYLOAD_ITEM,
                    metadata: JSON.stringify({message: e.message, stack: e.stack })
                });
            }
        }

        return dataInterceptions;

    } finally {

        // Flush the collected failures whatever happened above; a failing
        // audit is converted to a resolved value so the others still run.
        if (failedItems.length > 0) {
            const pendingAudits = failedItems.map((failure) => persistAudit(failure as IReferrerAudit));
            await Promise.all(pendingAudits.map(pending => pending.catch(e => e)));
        }
    }
};

/**
 * Post-processes the output of one crawl worker and persists everything
 * derived from it.
 *
 * Steps, in order:
 *  1. store an INFO audit summarising the HTTP codes and step statuses;
 *  2. when the crawl status is above NOT_SET and either HTTP code is below 400:
 *     redact intercepted payloads and data URLs, store the crawl output (as
 *     local files when `Config.ReferrersCrawler.storeLocal` is 1, otherwise
 *     through `CrawlOutputsController`, falling back to local files if that
 *     fails) and store the detected search endpoint, if any;
 *  3. store one audit per exception reported in `html.exceptions` and in
 *     `crawl_exceptions`.
 *
 * Any error raised along the way is caught, recorded as an audit and exposed
 * in the `exception` field of the result instead of being thrown.
 *
 * @param workerResult output returned by the crawl worker
 * @returns `{ domain, status, exception }`, where `status` is the worker's
 *          `crawl_status` and `exception` is set only when processing failed
 */
const handleCrawlResponse = async (workerResult: IWorkerOutput) => {

    const crawlStatusResult: any = {
        domain: undefined,
        exception: undefined,
        status: Constants.CRAWL_STATUS.NOT_SET
    };

    try {

        crawlStatusResult.domain = workerResult.domain;
        crawlStatusResult.status = workerResult.crawl_status;

        const {
            domain,
            url,
            lang_codes,
            html,
            search_inputs,
            search_clicks,
            searchEndpoint,
            preFlightHttpCode,
            html_step_status,
            lookup_step_status,
            clicks_search_step_status,
            search_endpoint_step_status,
            crawl_exceptions,
            start,
            end
        } : IWorkerOutput = workerResult;

        const htmlOutput = html as HtmlOutput;
        const clickItems = search_clicks as ClickSearchElement[];
        const runId = Config.ReferrersCrawler.crawlRunId;

        // -1 marks "no html output at all".
        const htmlHttpCode = html ? htmlOutput.statusCode : -1;

        // Writes the raw html / search_inputs / search_clicks outputs as JSON files
        // in the log directory generated for this url.
        const writeOutputsToDisk = async () => {
            const outputLogDir = Utils.generateLogDir(Config.ReferrersCrawler.outputsPath, url);
            const filePrefix = `${outputLogDir}/${domain}-${runId}`;
            await Utils.writeToFileAsync(`${filePrefix}-html.json`, html);
            await Utils.writeToFileAsync(`${filePrefix}-search_inputs.json`, search_inputs);
            await Utils.writeToFileAsync(`${filePrefix}-search_clicks.json`, search_clicks);
        };

        // audit for all steps performed.
        await persistAudit({
            domain,
            preFlightHttpCode,
            htmlHttpCode,
            html_step_status,
            lookup_step_status,
            clicks_search_step_status,
            search_endpoint_step_status,
            startCrawlTime: start.toUTCString(),
            endCrawlTime: end.toUTCString(),
            type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.INFO,
            metadata: undefined
        } as IReferrerAudit);

        // NOTE(review): the -1 sentinel used when html is missing is truthy and
        // below 400, so it satisfies the second test — confirm this is intended.
        const reachable = (preFlightHttpCode && preFlightHttpCode < 400) || (htmlHttpCode && htmlHttpCode < 400);

        if(crawlStatusResult.status > Constants.REFERRERS_CRAWLER.CRAWL_STATUS.NOT_SET && reachable)
        {
            if(html && htmlOutput.requestInterceptions)
                htmlOutput.requestInterceptions = await redactPayloads(domain, htmlOutput.requestInterceptions, 100);

            if(html && htmlOutput.responseInterceptions)
                htmlOutput.responseInterceptions = await redactPayloads(domain, htmlOutput.responseInterceptions, 100);

            if(html && htmlOutput.requestsChain)
            {
                for(const requestData of Object.values(htmlOutput.requestsChain) as any[])
                {
                    requestData.url = Utils.redactDataURLWithHash(requestData.url);
                }
            }

            if(search_clicks && clickItems.length)
            {
                for(const clickItem of clickItems)
                {
                    // A missing entry must not abort processing of the whole crawl.
                    if(!clickItem)
                        continue;

                    if(clickItem.requestsChain)
                    {
                        for(const requestData of Object.values(clickItem.requestsChain))
                        {
                            requestData.url = Utils.redactDataURLWithHash(requestData.url);
                        }
                    }

                    clickItem.requestInterceptions = await redactPayloads(domain, clickItem.requestInterceptions, 100);
                    clickItem.responseInterceptions = await redactPayloads(domain, clickItem.responseInterceptions, 100);
                }
            }

            try {

                const crawlStatuses = Constants.REFERRERS_CRAWLER.CRAWL_STATUS;
                let statusOfSearch = Constants.CRAWL_STATUS.NOT_SET;

                if(clicks_search_step_status === crawlStatuses.SEARCH_CLICKS_VALID
                    || search_endpoint_step_status === crawlStatuses.SEARCH_URL_ENDPOINT_SUCCESS)
                    statusOfSearch = crawlStatuses.SUCCESS;
                else if(lookup_step_status === crawlStatuses.SEARCH_INPUTS_NOT_FOUND)
                    statusOfSearch = crawlStatuses.SEARCH_INPUTS_NOT_FOUND;
                else if(lookup_step_status === crawlStatuses.SEARCH_INPUTS_NOT_VALID)
                    statusOfSearch = crawlStatuses.SEARCH_INPUTS_NOT_VALID;

                const newCrawlerResult: IReferrerCrawlOutput = {
                    domain,
                    url,
                    lang_codes,
                    html: JSON.stringify(html),
                    search_inputs: JSON.stringify(search_inputs),
                    search_clicks: JSON.stringify(search_clicks),
                    status: statusOfSearch,
                    start: start,
                    end: end,
                    runId
                };

                if(Config.ReferrersCrawler.storeLocal === 1)
                {
                    await writeOutputsToDisk();
                }
                else
                {
                    // Must be awaited: otherwise a rejected write escapes this try
                    // block, the disk fallback below never runs and the rejection
                    // is left unhandled.
                    await CrawlOutputsController.create(newCrawlerResult);
                }

            } catch (e) {

                await persistAudit({
                    domain: domain || null,
                    type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.CRAWLER_MANAGER_ERROR,
                    context: Constants.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.PERSISTING_CRAWL_NOSQL,
                    metadata: JSON.stringify({message: e.message, stack: e.stack })
                } as IReferrerAudit);

                // Keep the crawl output on disk when it could not be stored.
                await writeOutputsToDisk();
            }

            if(searchEndpoint)
            {
                await SearchEndpointsController.create({
                    domain,
                    regexMatch: searchEndpoint.regexMatch,
                    url: searchEndpoint.url,
                    type: searchEndpoint.type,
                    // NOTE(review): status is filled from `type` — confirm this is
                    // not meant to come from a dedicated status value.
                    status: searchEndpoint.type,
                    runId
                } as ISearchEndpoint);
            }
        }

        if(html && htmlOutput.exceptions && htmlOutput.exceptions.length > 0)
        {
            const browserAudits = htmlOutput.exceptions.map((exception_audit) => persistAudit({
                domain: domain,
                type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.HEADLESS_BROWSER_ERROR,
                metadata: JSON.stringify(exception_audit),
            } as IReferrerAudit));

            await Promise.all(browserAudits.map(p => p.catch(e => e)));
        }

        if(crawl_exceptions && crawl_exceptions.length)
        {
            const crawlAudits = crawl_exceptions.map((exception_audit: IReferrerAudit) => persistAudit(exception_audit));
            await Promise.all(crawlAudits.map(p => p.catch(e => e)));
        }

    } catch (error) {

        crawlStatusResult.exception = {
            domain: workerResult ? workerResult.domain : undefined,
            type: Constants.REFERRERS_CRAWLER.AUDIT_TYPES.HEADLESS_BROWSER_ERROR,
            metadata: JSON.stringify({message: error && error.message, stack: error && error.stack })
        }
    }

    if(crawlStatusResult.exception)
    {
        await persistAudit(crawlStatusResult.exception);
    }

    return crawlStatusResult;
}

/**
 * Records the final crawl status of a domain together with the current run id.
 *
 * @param domain       domain whose referrer record is updated
 * @param crawl_status status value written to the record
 * @throws any error raised by `ReferrersController.updateOne`
 */
const persistCrawlStatus = async (domain : string, crawl_status: number) => {
    const selector = { domain };
    const changes = {
        status: crawl_status,
        lastRunId: Config.ReferrersCrawler.crawlRunId
    } as IDomainReferrer;

    await ReferrersController.updateOne(selector, changes);
};

// Entry point: lift the listener limit on `process`, then start the scheduler.
(async function bootstrap(): Promise<void> {
    // A limit of 0 means "unlimited" for EventEmitter listeners.
    process.setMaxListeners(0);

    // Runs crawlerLoop once right away and then every `loopInterval` milliseconds.
    startLoop(loopInterval, crawlerLoop);
})();
