
import workerpool from 'workerpool';
import _ from 'lodash';
import * as Config from './config';
import * as Consts from './shared/constants';
import * as Utils from './shared/utils';
import * as BrowserUtils from './shared/browser-utils';
import * as Tasks from './tasks/referrers-tasks';
import { ClickSearchElement } from 'referrers/click-search-element';
import { HtmlOutput, IHtmlOutputError } from 'referrers/html-output';
import { SearchLookup, ISearchLookupError } from 'referrers/search-input-lookup';
import { IWorkerOutput } from './interfaces/referrers/worker-output.interface';

/**
 * turns a bare domain into a url by prefixing it with the `http://` scheme.
 * the domain is not validated or normalised in any way.
 * @param {string} domain 
 * @returns {string} the domain prefixed with `http://`
 */
const prepDomainUrl = (domain: string): string => `http://${domain}`;

/**
 * attempts to build an absolute search url out of the url extracted from the page and the parent domain.
 *
 * resolution order:
 *  1. an absolute url is returned as normalised by the URL parser;
 *  2. a protocol-relative url (`//host/path`) keeps its own host and gets the `http:` scheme;
 *  3. anything else is treated as a path on the parent domain.
 *
 * @param {string} domain parent domain, used only when `url` is relative
 * @param {string} url url (absolute, protocol-relative or relative) extracted from the page
 * @returns {string} the absolute search url
 * @throws {TypeError} when `url` is null or undefined
 */
const buildSearchUrl = (domain:string, url:string) => {

	// fail with an explicit message instead of an opaque "cannot read 'indexOf' of null"
	if(url === null || url === undefined)
		throw new TypeError('buildSearchUrl: a search url is required');

	try {
		return new URL(url).href;
	} catch {
		// not an absolute url, keep trying below
	}

	// protocol-relative urls carry their own host and must not be appended to the parent domain
	if(url.indexOf('//') === 0)
	{
		try {
			return new URL(`http:${url}`).href;
		} catch {
			// not a usable host, fall back to treating it as a path
		}
	}

	return `${prepDomainUrl(domain)}${url.indexOf('/') === 0 ? url : '/' + url}`;
}

/**
 * looks for a search url endpoint declaration inside an html input.
 * https://trends.builtwith.com/widgets/site-search
 * @param {string} html 
 * @returns the first text fragment matching the configured search endpoint regex,
 * or null when the html is empty or nothing matches
 */
const lookupSearchUrl = (html: string) => {

	// nothing to scan
	if(!html || html.length === 0)
		return null;

	const endpointMatches = html.match(Config.ReferrersCrawler.findSearchEndpointRegex);

	return endpointMatches && endpointMatches.length ? endpointMatches[0] : null;
}

/**
 * extracts the url from a search url endpoint container, i.e. the value of its `"target"` property.
 * https://trends.builtwith.com/widgets/site-search
 *
 * optional spaces around the key and the colon are tolerated, and the value may be wrapped in
 * single or double quotes.
 * @param {string} searchEndpointTextMatch 
 * @returns {string | null} the target url, or null when the input is not a string or has no target
 */
const parseSearchUrl = (searchEndpointTextMatch: string) => {

	if(typeof searchEndpointTextMatch !== 'string')
		return null;

	// a single regex both locates the property and captures its value, so the value is
	// extracted correctly for every form the pattern accepts (spaces, single quotes)
	const targetMatch = /" *target *" *: *['"]([^'"]+)/i.exec(searchEndpointTextMatch);

	return targetMatch ? targetMatch[1] : null;
}

/**
 * collects the language codes of every country whose tld equals the provided one.
 * codes are returned in country order and are not de-duplicated.
 * @param {string} tld 
 * @param {object} countries entries exposing a `tld` and a `langs` list of `{ code }` items
 * @returns {Array<string>} the matching language codes, empty when the tld is empty or unknown
 */
const getDomainCountryCodes = (tld: string, countries: Array<any>): Array<string> => {

	// without a tld there is nothing to match against
	if(!tld)
		return [];

	return countries
		.filter((country) => country.tld === tld)
		.flatMap((country) => country.langs.map((lang: any) => lang.code));
}

/**
 * remembers whether the process-level 'unhandledRejection' hook was already registered,
 * so that crawling many domains in the same process adds a single listener only.
 */
let unhandledRejectionHookInstalled = false;

/**
 * registers, once per process, a hook that rethrows unhandled promise rejections.
 */
const installUnhandledRejectionHook = () => {

	if(unhandledRejectionHookInstalled)
		return;

	unhandledRejectionHookInstalled = true;

	process.on('unhandledRejection', error => {
		throw error;
	});
}

/**
 * builds a WORKER_ERROR audit entry out of an error caught during a crawl step.
 * @param {string} domain domain being crawled
 * @param {any} context one of the AUDIT_CONTEXT_MESSAGES values naming the failing step
 * @param {any} error the caught error; its message and stack are serialised into `metadata`
 */
const buildWorkerException = (domain: string, context: any, error: any) => {
	return {
		domain,
		type: Consts.REFERRERS_CRAWLER.AUDIT_TYPES.WORKER_ERROR,
		context,
		metadata: JSON.stringify({
			message: error ? error.message : undefined,
			stack: error ? error.stack : undefined
		})
	};
}

/**
 * tells whether at least one click output carries a status without an error flag.
 * @param {Array<any>} outputs click outputs collected so far
 */
const hasSuccessfulClick = (outputs: Array<any>): boolean => {
	return outputs.some((d: any) => d.status && !d.status.error);
}

/**
 * where all crawling steps take place:
 *  - preflight request against the domain url;
 *  - step 1: fetch the html;
 *  - step 2: look up search inputs (falling back to clicking non-input candidates);
 *  - step 3: type the search query into the inputs that were found;
 *  - step 4: look for a search endpoint declared in the html and visit it.
 *
 * every step records its failures in `crawl_exceptions` instead of aborting the crawl. once the
 * main try block is entered the function always resolves, because the result is returned from
 * the `finally` block.
 * @param {string} domain
 * @returns {Promise<IWorkerOutput>} 
 */
const doCrawlingWork = async (domain: string): Promise<IWorkerOutput> => {

	console.log(`preparing DOMAIN: ${domain}`);

	const crawlerResult = {

		preFlightHttpCode: -1, 

		domain, url: null,
		lang_codes: null,
		html: null,
		search_inputs: null, 
		search_clicks: null, 
		crawl_exceptions: null,
		searchEndpoint: null,

		html_step_status: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.NOT_SET,
		lookup_step_status: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.NOT_SET,
		clicks_search_step_status: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.NOT_SET,
		search_endpoint_step_status: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.NOT_SET,

		crawl_status: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.NOT_SET
	} as IWorkerOutput;

	const url = prepDomainUrl(domain);
	const domain_tld = Utils.extractTLD(domain);
	const exceptions = [];

	let language_codes = [];
	let usedSelectors: Array<HTMLInputElement> = [];
	let nonInputsSelectors: Array<HTMLInputElement> = [];
	let browser: any = undefined;
	let preFlightDetails = undefined;
	let preFlightFailure = undefined;

	try {

		// registered once per process; registering on every call leaked one listener per domain
		installUnhandledRejectionHook();

		try {
			preFlightDetails = await Utils.preflightRequest(url, 10000);
		} catch (error) {
			console.error(error);
			preFlightFailure = error;
		}

		if(!preFlightDetails)
		{
			// the preflight request itself failed (or produced nothing): report it as such
			// instead of crashing on the undefined details below
			crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
			exceptions.push(buildWorkerException(
				domain,
				Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.PREFLIGHT_REQUEST,
				preFlightFailure || new Error('preflight request returned no details')
			));
		}
		else
		if((preFlightDetails.error && preFlightDetails.error.code) || (preFlightDetails.status && preFlightDetails.status >= 400))
		{
			const preFlightException = {
				domain,
				type: Consts.REFERRERS_CRAWLER.AUDIT_TYPES.WORKER_ERROR,
				context: Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.PREFLIGHT_REQUEST
			} as any

			if(preFlightDetails.error)
			{
				const { code } = preFlightDetails.error;

				// connectivity related system errors are all reported as DNS_ERROR
				switch(code)
				{
					case Consts.SYSTEM_ERRORS.ENOTFOUND:
					case Consts.SYSTEM_ERRORS.ETIMEDOUT:
					case Consts.SYSTEM_ERRORS.ECONNREFUSED:
					case Consts.SYSTEM_ERRORS.EAI_AGAIN:
						crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.DNS_ERROR;
						break;
					default:
						crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
				}

				preFlightException.metadata = JSON.stringify(preFlightDetails.error);
			}
			else
			{
				// no error object, so this branch was entered because of a status >= 400
				crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.INVALID_HTTP_STATUS_CODE;
				preFlightException.metadata = preFlightDetails.status;
			}

			exceptions.push(preFlightException);
		}
		else
		if(preFlightDetails.timeout === 1)
		{
			// NOTE(review): the timeout flag (1) is stored as the preflight http code - confirm this is intended
			crawlerResult.preFlightHttpCode = preFlightDetails.timeout;
			crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
		}
		else
		{
			crawlerResult.preFlightHttpCode = preFlightDetails.status;

			try {

				browser = await BrowserUtils.getBrowser();

			} catch (error) {

				exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.STARTING_BROWSER, error));

				// without a browser no step can run: let the outer catch flag the whole crawl as failed
				throw(error);
			}

			language_codes.push(Config.ReferrersCrawler.defaultLanguageCode);
			language_codes.push(...getDomainCountryCodes(domain_tld, Consts.REFERRERS_CRAWLER.LANGUAGE_MAPPINGS.countries));

			// Stage 1 - get the HTML
			try {
				console.log(`STEP 1: ${url}`);
				crawlerResult.html = await Tasks.getHTMLContent(browser, url, Config.ReferrersCrawler.pageTimeout);
				crawlerResult.html_step_status = (crawlerResult.html as HtmlOutput).status.error === 0
					? Consts.REFERRERS_CRAWLER.CRAWL_STATUS.HTML
					: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
				await BrowserUtils.closeOpenPages(browser);

			} catch (e) {

				exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.EXTRACTING_HTML_STEP, e));

				(crawlerResult.html as HtmlOutput) = { error: 1 } as IHtmlOutputError;
			}

			// Stage 2 - find inputs.
			try {
				console.log(`STEP 2: ${url}`);

				// the language declared by the page itself is added to the candidates
				if(crawlerResult.html && (crawlerResult.html as HtmlOutput).html) {
					const html_lang_attr_value = Utils.extractLanguageFromHTML((crawlerResult.html as HtmlOutput).html);
					if(html_lang_attr_value && html_lang_attr_value.length > 0 && !language_codes.includes(html_lang_attr_value))
					{
						language_codes.push(html_lang_attr_value);
					}
				}

				// two letter codes are lower-cased before duplicates are removed
				language_codes = Utils.removeDuplicates(language_codes.map(d => d.length === 2 ? d.toLowerCase() : d));

				crawlerResult.search_inputs = await Tasks.searchElementLookup(browser, url, Config.ReferrersCrawler.pageTimeout, language_codes);

				if((crawlerResult.search_inputs as SearchLookup).status.error === 1)
					crawlerResult.lookup_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
				else
				if((crawlerResult.search_inputs as SearchLookup).searchInputs.length === 0)
					crawlerResult.lookup_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_INPUTS_NOT_FOUND;
				else
				{
					const searchInputs = (crawlerResult.search_inputs as SearchLookup).searchInputs;
					const isTextLikeInput = (d: HTMLInputElement) => d.type === 'text' || d.type === 'search';

					// text/search inputs can be typed into directly; everything else is kept as a
					// candidate to be clicked in case no usable input exists
					for(let i = 0; i < searchInputs.length; i++) {
						usedSelectors.push(...searchInputs[i].inputs.filter(isTextLikeInput));
						nonInputsSelectors.push(...searchInputs[i].inputs.filter((d: HTMLInputElement) => !isTextLikeInput(d)));
					}

					usedSelectors = _.uniqBy(usedSelectors, (input: HTMLInputElement) => {
						return Utils.buildQuerySelector(input,'input');
					});

					crawlerResult.lookup_step_status = usedSelectors.length > 0
						? Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_INPUTS_FOUND
						: Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_INPUTS_NOT_VALID;
				}

				await BrowserUtils.closeOpenPages(browser);

			} catch (e) {

				exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.LOOKUP_INPUTS_STEP, e));

				crawlerResult.search_inputs = { error: 1 } as ISearchLookupError;
			}

			// no text/search input was found: click up to 5 of the other candidates, in batches
			if(crawlerResult.lookup_step_status === Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_INPUTS_NOT_VALID) {

				let allPossilbeSearchInputWrapperSelectors = _.uniqBy(nonInputsSelectors, (input: any) => {
					return Utils.buildGenericQuerySelector(input, input.tagName);
				});

				allPossilbeSearchInputWrapperSelectors = allPossilbeSearchInputWrapperSelectors.slice(0, 5);

				let outputs: any = [];

				for (let i = 0; i < allPossilbeSearchInputWrapperSelectors.length; i += Config.ReferrersCrawler.searchJobsBatch) {

					try {

						const batch = allPossilbeSearchInputWrapperSelectors.slice(i, i + Config.ReferrersCrawler.searchJobsBatch);
						const promises = batch.map((searchElement) => Tasks.clickAndWaitForSearchElement(url, searchElement, Config.ReferrersCrawler.pageTimeout, Config.ReferrersCrawler.searchQuery));
						// rejections are turned into values so that one failing job does not discard the batch
						const results = await Promise.all(promises.map(p => p.catch(e => e)));
						outputs.push.apply(outputs, results.filter(result => result && !(result instanceof Error)));

						if(hasSuccessfulClick(outputs))
						{
							crawlerResult.lookup_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_INPUTS_FOUND;
						}

					} catch (e)  {
						exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.CLICK_SEARCH_INPUT_LOOP_STEP, e));
					}
				}
			}

			if(crawlerResult.lookup_step_status === Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_INPUTS_FOUND)
			{
				try {

					console.log(`STEP 3: ${url}`);

					let outputs: any = [];

					for (let i = 0; i < usedSelectors.length; i += Config.ReferrersCrawler.searchJobsBatch) {

						const batch = usedSelectors.slice(i, i + Config.ReferrersCrawler.searchJobsBatch);
						let promises = undefined;
						let results = [];

						// search in the same tab
						try {

							promises = batch.map((searchElement) => Tasks.clickSearchElement(browser, url, searchElement, Config.ReferrersCrawler.pageTimeout, Config.ReferrersCrawler.searchQuery));
							results = await Promise.all(promises.map(p => p.catch(e => e)));
							outputs.push.apply(outputs, results.filter(result => result && !(result instanceof Error)));

						} catch (e)  {
							exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.CLICK_SEARCH_INPUT_LOOP_STEP, e));
						}

						// same batch again through the "another tab" task
						try {

							promises = batch.map((searchElement) => Tasks.clickSearchElementAnotherTab(browser, url, searchElement, Config.ReferrersCrawler.pageTimeout, Config.ReferrersCrawler.searchQuery));
							results = await Promise.all(promises.map(p => p.catch(e => e)));
							outputs.push.apply(outputs, results.filter(result => result && !(result instanceof Error)));

						} catch (e) {
							exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.CLICK_SEARCH_INPUT_LOOP_ANOTHER_TAB_STEP, e));
						}
					}

					if(outputs.length > 0) {

						if(hasSuccessfulClick(outputs))
							crawlerResult.search_clicks = outputs.slice(0, 5);
						else
						{
							// every click failed: fall back to a combined lookup + click attempt
							try {

								const searchAndClickResult = await Tasks.searchLookupAndClickSearchElement(browser, url, Config.ReferrersCrawler.pageTimeout, Config.ReferrersCrawler.searchQuery, language_codes);

								if(searchAndClickResult.search_inputs_result && searchAndClickResult.search_inputs_result.length > 0 && searchAndClickResult.search_clicks_result)
								{
									crawlerResult.search_inputs = searchAndClickResult.search_inputs_result;
									crawlerResult.search_clicks = [searchAndClickResult.search_clicks_result];
								}

							} catch (e) {
								exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.LOOKUP_AND_CLICK_SEARCH_INPUT_STEP, e));
							}
						}
					}

					if(crawlerResult.search_clicks && (crawlerResult.search_clicks as Array<ClickSearchElement>).length > 0)
						crawlerResult.clicks_search_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_CLICKS_VALID;
					else 
						crawlerResult.clicks_search_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_CLICKS_NOT_VALID;

				} catch (e) {
					exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.CLICK_SEARCH_INPUT_STEP, e));
				}
			}

			// Stage 4 - lookup search_endpoint_url
			const possibleSearchEndpointUrl = lookupSearchUrl(crawlerResult.html ? (crawlerResult.html as HtmlOutput).html : '');
			if(possibleSearchEndpointUrl && possibleSearchEndpointUrl.length > 0)
			{
				try {

					const parsed_search_url = parseSearchUrl(possibleSearchEndpointUrl);
					const search_endpoint_url = buildSearchUrl(domain, parsed_search_url);

					crawlerResult.searchEndpoint = {
						regexMatch: possibleSearchEndpointUrl,
						url: search_endpoint_url,
						type: Consts.REFERRERS_CRAWLER.SEARCH_ENDPOINT_TYPES.HTML_BODY_EXTRACTED,
						status: Consts.REFERRERS_CRAWLER.SEARCH_ENDPOINT_STATUS.NOT_SET
					}

					// the placeholder of the endpoint url is replaced by the configured search query
					const searchEndpointResult = await Tasks.searchBySearchEndpointUrl(browser, search_endpoint_url.replace(Config.ReferrersCrawler.searchEndpointsMatchPattern, Config.ReferrersCrawler.searchQuery), Config.ReferrersCrawler.pageTimeout);

					if(searchEndpointResult && searchEndpointResult.status && searchEndpointResult.status.error === 0)
					{
						if(crawlerResult.search_clicks && (crawlerResult.search_clicks as Array<ClickSearchElement>).length)
							(crawlerResult.search_clicks as Array<ClickSearchElement>).push(searchEndpointResult);
						else
							crawlerResult.search_clicks = [searchEndpointResult];

						crawlerResult.search_endpoint_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_URL_ENDPOINT_SUCCESS;
					}
					else {

						crawlerResult.search_endpoint_step_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.SEARCH_URL_ENDPOINT_FAILED;

						exceptions.push({
							domain,
							type: Consts.REFERRERS_CRAWLER.AUDIT_TYPES.INFO,
							context: Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.VISITING_SEARCH_ENDPOINT_STEP,
							metadata: JSON.stringify(searchEndpointResult) 
						});
					}

				} catch (e) {
					exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.VISITING_SEARCH_ENDPOINT_STEP, e));
				}
			}

			crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.COMPLETED;
		}

	} catch(e) {

		crawlerResult.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
		exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.DOMAIN_VISIT_MAIN, e));

	} finally {

		try {

			if(browser)
			{
				await BrowserUtils.closeOpenPages(browser);
				await browser.close();
			}

		} catch (e) {
			exceptions.push(buildWorkerException(domain, Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.CLOSING_BROWSER, e));
		}

		crawlerResult.lang_codes = language_codes ? language_codes.toString() : "";
		crawlerResult.url = url;
		crawlerResult.crawl_exceptions = exceptions;

		// returning from `finally` guarantees the caller always gets a result for the domain
		return crawlerResult;
	}
}

/**
 * worker method exposed to the pool.
 *
 * runs the crawl for a domain and copies its result into a fresh record stamped with the start
 * and end time of the crawl. when the crawl itself rejects, the record is still returned, with
 * `crawl_status` set to ERROR and the failure stored in `crawl_exceptions`.
 * @param {string} inputDomain 
 * @returns the crawl record, or an `{ error: 1, domain, message, stack }` descriptor when the
 * record itself could not be produced
 */
const crawlDomain = async (inputDomain: string) => {

	try {

		const startCrawlTime = new Date();

		const worker_data = {

			domain: inputDomain,
			url: null,
			lang_codes: null,
			html: null,
			search_inputs: null,
			search_clicks: null,
			searchEndpoint: null,

			preFlightHttpCode: -1,
			html_step_status: null,
			lookup_step_status: null,
			clicks_search_step_status: null,
			search_endpoint_step_status: null,

			crawl_status: Consts.CRAWL_STATUS.NOT_SET,
			crawl_exceptions: [],
			start: null,
			end: null

		} as IWorkerOutput

		try {

			const crawl = await doCrawlingWork(inputDomain);

			worker_data.domain = crawl.domain;
			worker_data.url = crawl.url;

			worker_data.lang_codes = crawl.lang_codes;
			worker_data.html = crawl.html;
			worker_data.search_inputs = crawl.search_inputs;
			worker_data.search_clicks = crawl.search_clicks;
			worker_data.searchEndpoint = crawl.searchEndpoint;

			worker_data.preFlightHttpCode = crawl.preFlightHttpCode;
			worker_data.html_step_status = crawl.html_step_status;
			worker_data.lookup_step_status = crawl.lookup_step_status;
			worker_data.clicks_search_step_status = crawl.clicks_search_step_status;
			worker_data.search_endpoint_step_status = crawl.search_endpoint_step_status;

			worker_data.crawl_exceptions = crawl.crawl_exceptions;
			worker_data.crawl_status = crawl.crawl_status;

		} catch (e) {

			// keep the failure on the record: rethrowing here while returning from a `finally`
			// block silently discarded the error and reported the crawl as NOT_SET
			const failure: any = {
				domain: inputDomain,
				type: Consts.REFERRERS_CRAWLER.AUDIT_TYPES.WORKER_ERROR,
				context: Consts.REFERRERS_CRAWLER.AUDIT_CONTEXT_MESSAGES.DOMAIN_VISIT_MAIN,
				metadata: JSON.stringify({message: e.message, stack: e.stack }) 
			};

			worker_data.crawl_status = Consts.REFERRERS_CRAWLER.CRAWL_STATUS.ERROR;
			worker_data.crawl_exceptions = [failure];
		}

		worker_data.start = startCrawlTime;
		worker_data.end = new Date();

		return worker_data;

	} catch (e) {
		return {
			error: 1,
			domain: inputDomain || undefined,
			message: e.message,
			stack: e.stack
		}
	}
}

// registers this module as a workerpool worker; the pool reaches `crawlDomain` under the name `doWork`
workerpool.worker({ doWork: crawlDomain });
