# sync.tcl
#
# This op syncs up the file system with the database.
#
# 1. If page is in both places, do nothing
# 2. If page is only in db, remove it from db
# 3. If page is only in file system, index it

# URL registrations.
# GET  /NS/Syncpages  is handled by syncpages, which returns a page
#                     containing a form that POSTs to /NS/Reallysync.
# POST /NS/Reallysync is handled by reallysync, which does the actual work.
ns_register_proc GET /NS/Syncpages syncpages
ns_register_proc POST /NS/Reallysync reallysync

# syncpages --
#
#   Handler for GET /NS/Syncpages.  Returns an HTML page that explains what
#   the synchronization does, warns that it may take a long time, and
#   offers a form whose submit button POSTs to /NS/Reallysync.
#
# Arguments:
#   conn    connection handle supplied by the server
#   ignore  second handler argument; not used here
#
proc syncpages {conn ignore} {
    # Look the page root up once; it is shown twice in the page.
    set pageroot [ns_info pageroot]

    set page \
"<HTML><HEAD> <TITLE>Synchronize
database and filesystem</TITLE> </HEAD><BODY><H1>Synchronize database
and filesystem </H1><P>This operation makes sure that the full text of
all pages in the file system under <code>$pageroot</code> are
indexed in the database and that there are no indices for non-existent
files."

    # The archiving sentence is only shown when archiving is switched on.
    # The leading space keeps it from running into the preceding "files."
    if {[ns_config DbServices Archiving]} {
        append page " Files that are not archived" \
            " are archived, and archives for non-existent files are removed."
    }

    append page \
"<P> If you've put lots of files into <code>$pageroot</code>
by hand, <H2>This may take a long time, and it will
freeze the server while it works!  </H2><P>
Suggestion: If you don't want your primary server frozen, run a temporary one
on another port for this.
<FORM ACTION=/NS/Reallysync METHOD=POST><INPUT TYPE=submit VALUE=Go></FORM>
</BODY></HTML>"

    ns_return $conn 200 text/html $page
}

# reallysync --
#
#   Handler for POST /NS/Reallysync.  Brings the database in line with the
#   files under the page root and streams a progress report to the client:
#
#     * ns_pages rows whose URL has no matching .htm/.html file are deleted;
#       .htm/.html files with no ns_pages row are indexed via ns_indexlocal.
#     * When the DbServices/Archiving setting is true, the same is done for
#       ns_archives against every file under the page root.
#
# Arguments:
#   conn    connection handle supplied by the server
#   ignore  second handler argument; not used here
#
proc reallysync {conn ignore} {

    # NOTE(review): 1000000 looks like a stand-in content length, since the
    # real size of the streamed report is not known up front -- confirm.
    ns_headers $conn 200 text/html 1000000

    ns_write $conn "<HTML><TITLE>Syncing filesystem and database</TITLE><BODY>"

    ns_write $conn "<HR><BR>Checking Full Text index...<BR><BR>\n"

    # NOTE(review): cd changes the working directory of the whole process,
    # not just of this request -- confirm that is acceptable here.
    cd [ns_info pageroot]

    # Files are found relative to "."; dropping the leading "." turns
    # "./a/b.html" into the URL form "/a/b.html".
    foreach url [gethtmlfiles .] {
        set realurl_array([string range $url 1 end]) 1
    }

    set db [ns_conn db $conn]

    # Read the complete result set before issuing any dml on the handle.
    set row [ns_db select $db "select ns_url from ns_pages where ns_url like '/%';"]
    set indexed_urls {}
    while {[ns_db getrow $db $row]} {
        lappend indexed_urls [ns_set get $row ns_url]
    }

    foreach indexed_url $indexed_urls {
        if {[info exists realurl_array($indexed_url)]} {
            # Present in both places: nothing to do.  Removing the entry
            # leaves only the file-system-only URLs in the array.
            unset realurl_array($indexed_url)
        } else {
            ns_write $conn "$indexed_url only in db; deleted<BR>\n"
            ns_db dml $db "delete from ns_pages where ns_url = '[enquote $indexed_url]';"
        }
    }

    foreach real_url [array names realurl_array] {
        ns_write $conn "$real_url only in file system; indexed<BR>\n"
        ns_indexlocal $db $real_url
    }

    # The array never comes into existence when no HTML file was found, and
    # a plain unset would then raise an error and abort the whole request.
    catch {unset realurl_array}

    if {[ns_config DbServices Archiving]} {

        ns_write $conn "<HR><BR>Checking Archive...<BR><BR>\n"

        foreach url [getallfiles .] {
            set realurl_array([string range $url 1 end]) 1
        }

        set row [ns_db select $db "select archive_url from ns_archives;"]
        set archived_urls {}
        while {[ns_db getrow $db $row]} {
            lappend archived_urls [ns_set get $row archive_url]
        }

        foreach archived_url $archived_urls {
            if {[info exists realurl_array($archived_url)]} {
                # Present in both places: nothing to do.
                unset realurl_array($archived_url)
            } else {
                ns_write $conn "$archived_url only in db; deleted<BR>\n"
                ns_db dml $db "delete from ns_archives where archive_url = '[enquote $archived_url]';"
            }
        }

        # Quote the page root as well: it ends up inside the same
        # single-quoted SQL literal as the URL.
        set sqlroot [enquote [ns_info pageroot]]
        foreach real_url [array names realurl_array] {
            set url [enquote $real_url]
            ns_write $conn "$real_url only in file system; archived<BR>\n"
            ns_db dml $db "insert into ns_archives (archive_url, archive_lo)
            values ('$url', FileToLO('$sqlroot/$url'));"
        }
    }
    ns_write $conn "Done.</BODY></HTML>"
}

# enquote --
#
#   Return the given string with every apostrophe doubled, which is the
#   form the callers in this file embed between single quotes in SQL.
#
# Arguments:
#   string  text to quote
#
proc enquote {string} {
    # Cutting at each apostrophe and gluing the pieces back together with
    # two apostrophes doubles every occurrence.
    return [join [split $string "'"] "''"]
}

# gethtmlfiles --
#
#   Return what getallfiles finds below dir, restricted to paths that end
#   in ".htm" or ".html".
#
# Arguments:
#   dir  directory to scan
#
proc gethtmlfiles {dir} {
    set html_pattern {\.htm$|\.html$}
    return [getallfiles $dir $html_pattern]
}

# getallfiles --
#
#   Recursively collect the files below a directory.
#
# Arguments:
#   dir      directory to scan; every returned path starts with this value
#   pattern  regular expression a file's path must match to be included
#            (defaults to ".*", which accepts every path)
#
# Results:
#   A list of paths.  Unreadable entries are skipped, directories that are
#   symbolic links are not descended into, and only entries for which
#   "file isfile" is true are reported.  Entries are found with the glob
#   pattern "*", so names glob does not match with "*" are not seen.
#
proc getallfiles {dir {pattern {.*}}} {
    set output {}
    foreach f [glob -nocomplain $dir/*] {
        if {![file readable $f]} {
            continue
        }
        if {[file isdirectory $f]} {
            # Recurse into real directories only, not into links to them.
            if {[file type $f] != "link"} {
                set output [concat $output [getallfiles $f $pattern]]
            }
        } elseif {[file isfile $f] && [regexp -- $pattern $f]} {
            # "--" keeps a pattern starting with "-" from being read as a
            # regexp switch.
            lappend output $f
        }
    }
    return $output
}
