xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 220d04022750f40f8bac8f01fa551211e28d04f2)
1 /*
2  * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/khash.h>
35 #include <util/list.h>
36 
37 #include "EndpointManager.h"
38 
39 
40 // References:
41 //  - RFC 793 - Transmission Control Protocol
42 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
43 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44 //
45 // Things this implementation currently doesn't implement:
46 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
47 //	  RFC 2001, RFC 2581, RFC 3042
48 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
49 //	- Explicit Congestion Notification (ECN), RFC 3168
50 //	- SYN-Cache
51 //	- TCP Extensions for High Performance, RFC 1323
52 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
53 //	- Forward RTO-Recovery, RFC 4138
54 //	- Time-Wait hash instead of keeping sockets alive
55 
// Formats \a addr for debug output. Expands to an AddressString temporary
// built with the caller's Domain(), so it is only usable where Domain() is
// in scope.
#define PrintAddress(addr) \
	AddressString(Domain(), addr, true).Data()
58 
59 //#define TRACE_TCP
60 //#define PROBE_TCP
61 
#ifdef TRACE_TCP
// Debug output, only compiled in when TRACE_TCP is defined. Every message is
// prefixed with the calling thread, the current system time, the endpoint
// pointer and the name of its state, and terminated with a newline. Since it
// refers to `this` and fState it can only be used where both are available.
// The space before ', ##args' is important in order for this to work with
// cpp 2.95.
#	define TRACE(fmt, args...) \
		dprintf("%ld: TCP [%llu] %p (%12s) " fmt "\n", find_thread(NULL), \
			system_time(), this, name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
70 
#ifdef PROBE_TCP
// Prints a one-line snapshot of the send state for \a packet (only compiled
// in when PROBE_TCP is defined): time, source and destination address, packet
// size, send next/unacknowledged, congestion window, slow start threshold,
// \a wnd, send window, send max minus unacknowledged, what the send queue has
// available from send next, send queue usage, and the retransmit timeout.
#	define PROBE(packet, wnd) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), \
		PrintAddress(packet->source), PrintAddress(packet->destination), \
		packet->size, \
		fSendNext.Number(), fSendUnacknowledged.Number(), \
		fCongestionWindow, fSlowStartThreshold, \
		wnd, fSendWindow, \
		(fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), \
		fRetransmitTimeout)
#else
#	define PROBE(packet, wnd)	do { } while (0)
#endif
82 
83 #if TCP_TRACING
84 namespace TCPTracing {
85 
86 class Receive : public AbstractTraceEntry {
87 public:
88 	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
89 			net_buffer* buffer)
90 		:
91 		fEndpoint(endpoint),
92 		fBuffer(buffer),
93 		fBufferSize(buffer->size),
94 		fSequence(segment.sequence),
95 		fAcknowledge(segment.acknowledge),
96 		fWindow(window),
97 		fState(endpoint->State()),
98 		fFlags(segment.flags)
99 	{
100 		Initialized();
101 	}
102 
103 	virtual void AddDump(TraceOutput& out)
104 	{
105 		out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, "
106 			"seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState),
107 			fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
108 	}
109 
110 protected:
111 	TCPEndpoint*	fEndpoint;
112 	net_buffer*		fBuffer;
113 	uint32			fBufferSize;
114 	uint32			fSequence;
115 	uint32			fAcknowledge;
116 	uint32			fWindow;
117 	tcp_state		fState;
118 	uint8			fFlags;
119 };
120 
121 class Send : public AbstractTraceEntry {
122 public:
123 	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
124 			tcp_sequence firstSequence, tcp_sequence lastSequence)
125 		:
126 		fEndpoint(endpoint),
127 		fBuffer(buffer),
128 		fBufferSize(buffer->size),
129 		fSequence(segment.sequence),
130 		fAcknowledge(segment.acknowledge),
131 		fFirstSequence(firstSequence.Number()),
132 		fLastSequence(lastSequence.Number()),
133 		fState(endpoint->State()),
134 		fFlags(segment.flags)
135 	{
136 		Initialized();
137 	}
138 
139 	virtual void AddDump(TraceOutput& out)
140 	{
141 		out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, "
142 			"seq %lu, ack %lu, first %lu, last %lu",
143 			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
144 			fSequence, fAcknowledge, fFirstSequence, fLastSequence);
145 	}
146 
147 protected:
148 	TCPEndpoint*	fEndpoint;
149 	net_buffer*		fBuffer;
150 	uint32			fBufferSize;
151 	uint32			fSequence;
152 	uint32			fAcknowledge;
153 	uint32			fFirstSequence;
154 	uint32			fLastSequence;
155 	tcp_state		fState;
156 	uint8			fFlags;
157 };
158 
159 class State : public AbstractTraceEntry {
160 public:
161 	State(TCPEndpoint* endpoint)
162 		:
163 		fEndpoint(endpoint),
164 		fState(endpoint->State())
165 	{
166 		Initialized();
167 	}
168 
169 	virtual void AddDump(TraceOutput& out)
170 	{
171 		out.Print("tcp:%p (%12s) state change", fEndpoint,
172 			name_for_state(fState));
173 	}
174 
175 protected:
176 	TCPEndpoint*	fEndpoint;
177 	tcp_state		fState;
178 };
179 
180 class Spawn : public AbstractTraceEntry {
181 public:
182 	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
183 		:
184 		fListeningEndpoint(listeningEndpoint),
185 		fSpawnedEndpoint(spawnedEndpoint)
186 	{
187 		Initialized();
188 	}
189 
190 	virtual void AddDump(TraceOutput& out)
191 	{
192 		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
193 	}
194 
195 protected:
196 	TCPEndpoint*	fListeningEndpoint;
197 	TCPEndpoint*	fSpawnedEndpoint;
198 };
199 
200 class Error : public AbstractTraceEntry {
201 public:
202 	Error(TCPEndpoint* endpoint, const char* error, int32 line)
203 		:
204 		fEndpoint(endpoint),
205 		fLine(line),
206 		fError(error),
207 		fState(endpoint->State())
208 	{
209 		Initialized();
210 	}
211 
212 	virtual void AddDump(TraceOutput& out)
213 	{
214 		out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint,
215 			name_for_state(fState), fLine, fError);
216 	}
217 
218 protected:
219 	TCPEndpoint*	fEndpoint;
220 	int32			fLine;
221 	const char*		fError;
222 	tcp_state		fState;
223 };
224 
225 class Timer : public AbstractTraceEntry {
226 public:
227 	Timer(TCPEndpoint* endpoint, const char* which)
228 		:
229 		fEndpoint(endpoint),
230 		fWhich(which),
231 		fState(endpoint->State())
232 	{
233 		Initialized();
234 	}
235 
236 	virtual void AddDump(TraceOutput& out)
237 	{
238 		out.Print("tcp:%p (%12s) %s timer", fEndpoint,
239 			name_for_state(fState), fWhich);
240 	}
241 
242 protected:
243 	TCPEndpoint*	fEndpoint;
244 	const char*		fWhich;
245 	tcp_state		fState;
246 };
247 
248 }	// namespace TCPTracing
249 
250 #	define T(x)	new(std::nothrow) TCPTracing::x
251 #else
252 #	define T(x)
253 #endif	// TCP_TRACING
254 
// Initial estimate for packet round trip time (RTT); the constructor also
// uses it as the initial retransmit timeout.
// NOTE(review): presumably in microseconds, like system_time() - confirm.
#define TCP_INITIAL_RTT		2000000

// bits of the fFlags field
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
	FLAG_CLOSED					= 1 << 3,
	FLAG_DELETE_ON_CLOSE		= 1 << 4,
	FLAG_LOCAL					= 1 << 5
};


// Divisor that turns system_time() values into the timestamp ticks returned
// by tcp_now().
static const int kTimestampFactor = 1024;
273 
274 
275 static inline bigtime_t
276 absolute_timeout(bigtime_t timeout)
277 {
278 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
279 		return timeout;
280 
281 	return timeout + system_time();
282 }
283 
284 
285 static inline status_t
286 posix_error(status_t error)
287 {
288 	if (error == B_TIMED_OUT)
289 		return B_WOULD_BLOCK;
290 
291 	return error;
292 }
293 
294 
295 static inline bool
296 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
297 	uint32 receiveWindow)
298 {
299 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
300 }
301 
302 
303 static inline bool
304 segment_in_sequence(const tcp_segment_header& segment, int size,
305 	const tcp_sequence& receiveNext, uint32 receiveWindow)
306 {
307 	tcp_sequence sequence(segment.sequence);
308 	if (size == 0) {
309 		if (receiveWindow == 0)
310 			return sequence == receiveNext;
311 		return in_window(sequence, receiveNext, receiveWindow);
312 	} else {
313 		if (receiveWindow == 0)
314 			return false;
315 		return in_window(sequence, receiveNext, receiveWindow)
316 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
317 	}
318 }
319 
320 
321 static inline bool
322 is_writable(tcp_state state)
323 {
324 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
325 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
326 }
327 
328 
329 static inline uint32 tcp_now()
330 {
331 	return system_time() / kTimestampFactor;
332 }
333 
334 
335 static inline uint32 tcp_diff_timestamp(uint32 base)
336 {
337 	uint32 now = tcp_now();
338 
339 	if (now > base)
340 		return now - base;
341 
342 	return now + UINT_MAX - base;
343 }
344 
345 
346 static inline bool
347 state_needs_finish(int32 state)
348 {
349 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
350 		|| state == FINISH_SENT || state == CLOSING;
351 }
352 
353 
354 //	#pragma mark -
355 
356 
357 WaitList::WaitList(const char* name)
358 {
359 	fCondition = 0;
360 	fSem = create_sem(0, name);
361 }
362 
363 
364 WaitList::~WaitList()
365 {
366 	delete_sem(fSem);
367 }
368 
369 
370 status_t
371 WaitList::InitCheck() const
372 {
373 	return fSem;
374 }
375 
376 
377 status_t
378 WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
379 {
380 	locker.Unlock();
381 
382 	status_t status = B_OK;
383 
384 	while (!atomic_test_and_set(&fCondition, 0, 1)) {
385 		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
386 			timeout);
387 		if (status != B_OK)
388 			break;
389 	}
390 
391 	locker.Lock();
392 	return status;
393 }
394 
395 
396 void
397 WaitList::Signal()
398 {
399 	atomic_or(&fCondition, 1);
400 #ifdef __HAIKU__
401 	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
402 #else
403 	int32 count;
404 	if (get_sem_count(fSem, &count) == B_OK && count < 0)
405 		release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
406 #endif
407 }
408 
409 
410 //	#pragma mark -
411 
412 
413 TCPEndpoint::TCPEndpoint(net_socket* socket)
414 	:
415 	ProtocolSocket(socket),
416 	fManager(NULL),
417 	fReceiveList("tcp receive"),
418 	fSendList("tcp send"),
419 	fOptions(0),
420 	fSendWindowShift(0),
421 	fReceiveWindowShift(0),
422 	fSendUnacknowledged(0),
423 	fSendNext(0),
424 	fSendMax(0),
425 	fSendUrgentOffset(0),
426 	fSendWindow(0),
427 	fSendMaxWindow(0),
428 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
429 	fSendQueue(socket->send.buffer_size),
430 	fInitialSendSequence(0),
431 	fDuplicateAcknowledgeCount(0),
432 	fRoute(NULL),
433 	fReceiveNext(0),
434 	fReceiveMaxAdvertised(0),
435 	fReceiveWindow(socket->receive.buffer_size),
436 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
437 	fReceiveQueue(socket->receive.buffer_size),
438 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
439 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
440 	fRetransmitTimeout(TCP_INITIAL_RTT),
441 	fReceivedTimestamp(0),
442 	fCongestionWindow(0),
443 	fSlowStartThreshold(0),
444 	fState(CLOSED),
445 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
446 {
447 	// TODO: to be replaced with a real read/write locking strategy!
448 	mutex_init(&fLock, "tcp lock");
449 
450 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
451 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
452 		this);
453 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
454 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
455 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
456 		this);
457 }
458 
459 
460 TCPEndpoint::~TCPEndpoint()
461 {
462 	mutex_lock(&fLock);
463 
464 	_CancelConnectionTimers();
465 	gStackModule->cancel_timer(&fTimeWaitTimer);
466 
467 	if (fManager != NULL) {
468 		fManager->Unbind(this);
469 		put_endpoint_manager(fManager);
470 	}
471 
472 	mutex_destroy(&fLock);
473 
474 	// we need to wait for all timers to return
475 	gStackModule->wait_for_timer(&fRetransmitTimer);
476 	gStackModule->wait_for_timer(&fPersistTimer);
477 	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
478 	gStackModule->wait_for_timer(&fTimeWaitTimer);
479 
480 	gDatalinkModule->put_route(Domain(), fRoute);
481 }
482 
483 
484 status_t
485 TCPEndpoint::InitCheck() const
486 {
487 	if (fReceiveList.InitCheck() < B_OK)
488 		return fReceiveList.InitCheck();
489 
490 	if (fSendList.InitCheck() < B_OK)
491 		return fSendList.InitCheck();
492 
493 	return B_OK;
494 }
495 
496 
497 //	#pragma mark - protocol API
498 
499 
500 status_t
501 TCPEndpoint::Open()
502 {
503 	TRACE("Open()");
504 
505 	status_t status = ProtocolSocket::Open();
506 	if (status < B_OK)
507 		return status;
508 
509 	fManager = get_endpoint_manager(Domain());
510 	if (fManager == NULL)
511 		return EAFNOSUPPORT;
512 
513 	return B_OK;
514 }
515 
516 
517 status_t
518 TCPEndpoint::Close()
519 {
520 	TRACE("Close()");
521 
522 	MutexLocker locker(fLock);
523 
524 	if (fState == LISTEN)
525 		delete_sem(fAcceptSemaphore);
526 
527 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
528 		// TODO: what about linger in case of SYNCHRONIZE_SENT?
529 		fState = CLOSED;
530 		T(State(this));
531 		return B_OK;
532 	}
533 
534 	status_t status = _Disconnect(true);
535 	if (status != B_OK)
536 		return status;
537 
538 	if (socket->options & SO_LINGER) {
539 		TRACE("Close(): Lingering for %i secs", socket->linger);
540 
541 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
542 
543 		while (fSendQueue.Used() > 0) {
544 			status = fSendList.Wait(locker, maximum);
545 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
546 				break;
547 			else if (status < B_OK)
548 				return status;
549 		}
550 
551 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
552 			fSendQueue.Used());
553 	}
554 	return B_OK;
555 }
556 
557 
558 void
559 TCPEndpoint::Free()
560 {
561 	TRACE("Free()");
562 
563 	MutexLocker _(fLock);
564 
565 	if (fState <= SYNCHRONIZE_SENT)
566 		return;
567 
568 	// we are only interested in the timer, not in changing state
569 	_EnterTimeWait();
570 
571 	fFlags |= FLAG_CLOSED;
572 	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
573 		// we'll be freed later when the 2MSL timer expires
574 		gSocketModule->acquire_socket(socket);
575 	}
576 }
577 
578 
579 /*!	Creates and sends a synchronize packet to /a address, and then waits
580 	until the connection has been established or refused.
581 */
582 status_t
583 TCPEndpoint::Connect(const sockaddr* address)
584 {
585 	TRACE("Connect() on address %s", PrintAddress(address));
586 
587 	if (!AddressModule()->is_same_family(address))
588 		return EAFNOSUPPORT;
589 
590 	MutexLocker locker(fLock);
591 
592 	if (gStackModule->is_restarted_syscall()) {
593 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
594 		status_t status = _WaitForEstablished(locker, timeout);
595 		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
596 			strerror(status), timeout);
597 		return posix_error(status);
598 	}
599 
600 	// Can only call connect() from CLOSED or LISTEN states
601 	// otherwise endpoint is considered already connected
602 	if (fState == LISTEN) {
603 		// this socket is about to connect; remove pending connections in the backlog
604 		gSocketModule->set_max_backlog(socket, 0);
605 	} else if (fState == ESTABLISHED) {
606 		return EISCONN;
607 	} else if (fState != CLOSED)
608 		return EINPROGRESS;
609 
610 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
611 	sockaddr_storage _address;
612 	if (AddressModule()->is_empty_address(address, false)) {
613 		AddressModule()->get_loopback_address((sockaddr *)&_address);
614 		// for IPv4 and IPv6 the port is at the same offset
615 		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
616 		address = (sockaddr *)&_address;
617 	}
618 
619 	status_t status = _PrepareSendPath(address);
620 	if (status < B_OK)
621 		return status;
622 
623 	TRACE("  Connect(): starting 3-way handshake...");
624 
625 	fState = SYNCHRONIZE_SENT;
626 	T(State(this));
627 
628 	// send SYN
629 	status = _SendQueued();
630 	if (status != B_OK) {
631 		_Close();
632 		return status;
633 	}
634 
635 	// If we are running over Loopback, after _SendQueued() returns we
636 	// may be in ESTABLISHED already.
637 	if (fState == ESTABLISHED) {
638 		TRACE("  Connect() completed after _SendQueued()");
639 		return B_OK;
640 	}
641 
642 	// wait until 3-way handshake is complete (if needed)
643 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
644 	if (timeout == 0) {
645 		// we're a non-blocking socket
646 		TRACE("  Connect() delayed, return EINPROGRESS");
647 		return EINPROGRESS;
648 	}
649 
650 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
651 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
652 
653 	status = _WaitForEstablished(locker, absoluteTimeout);
654 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
655 		strerror(status), timeout);
656 	return posix_error(status);
657 }
658 
659 
660 status_t
661 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
662 {
663 	TRACE("Accept()");
664 
665 	MutexLocker locker(fLock);
666 
667 	status_t status;
668 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
669 	if (gStackModule->is_restarted_syscall())
670 		timeout = gStackModule->restore_syscall_restart_timeout();
671 	else
672 		gStackModule->store_syscall_restart_timeout(timeout);
673 
674 	do {
675 		locker.Unlock();
676 
677 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
678 			| B_CAN_INTERRUPT, timeout);
679 		if (status != B_OK) {
680 			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
681 				return B_WOULD_BLOCK;
682 
683 			return status;
684 		}
685 
686 		locker.Lock();
687 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
688 #ifdef TRACE_TCP
689 		if (status == B_OK)
690 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
691 #endif
692 	} while (status != B_OK);
693 
694 	return status;
695 }
696 
697 
698 status_t
699 TCPEndpoint::Bind(const sockaddr *address)
700 {
701 	if (address == NULL)
702 		return B_BAD_VALUE;
703 
704 	MutexLocker lock(fLock);
705 
706 	TRACE("Bind() on address %s", PrintAddress(address));
707 
708 	if (fState != CLOSED)
709 		return EISCONN;
710 
711 	return fManager->Bind(this, address);
712 }
713 
714 
715 status_t
716 TCPEndpoint::Unbind(struct sockaddr *address)
717 {
718 	TRACE("Unbind()");
719 
720 	MutexLocker _(fLock);
721 	return fManager->Unbind(this);
722 }
723 
724 
725 status_t
726 TCPEndpoint::Listen(int count)
727 {
728 	TRACE("Listen()");
729 
730 	MutexLocker _(fLock);
731 
732 	if (fState != CLOSED)
733 		return B_BAD_VALUE;
734 
735 	fAcceptSemaphore = create_sem(0, "tcp accept");
736 	if (fAcceptSemaphore < B_OK)
737 		return ENOBUFS;
738 
739 	status_t status = fManager->SetPassive(this);
740 	if (status != B_OK) {
741 		delete_sem(fAcceptSemaphore);
742 		fAcceptSemaphore = -1;
743 		return status;
744 	}
745 
746 	gSocketModule->set_max_backlog(socket, count);
747 
748 	fState = LISTEN;
749 	T(State(this));
750 	return B_OK;
751 }
752 
753 
754 status_t
755 TCPEndpoint::Shutdown(int direction)
756 {
757 	TRACE("Shutdown(%i)", direction);
758 
759 	MutexLocker lock(fLock);
760 
761 	if (direction == SHUT_RD || direction == SHUT_RDWR)
762 		fFlags |= FLAG_NO_RECEIVE;
763 
764 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
765 		// TODO: That's not correct. After read/write shutting down the socket
766 		// one should still be able to read previously arrived data.
767 		_Disconnect(false);
768 	}
769 
770 	return B_OK;
771 }
772 
773 
774 /*!	Puts data contained in \a buffer into send buffer */
775 status_t
776 TCPEndpoint::SendData(net_buffer *buffer)
777 {
778 	MutexLocker lock(fLock);
779 
780 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
781 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
782 		  fSendQueue.Free());
783 
784 	if (fState == CLOSED)
785 		return ENOTCONN;
786 	if (fState == LISTEN)
787 		return EDESTADDRREQ;
788 	if (!is_writable(fState)) {
789 		// we only send signals when called from userland
790 		if (gStackModule->is_syscall())
791 			send_signal(find_thread(NULL), SIGPIPE);
792 		return EPIPE;
793 	}
794 
795 	uint32 flags = buffer->flags;
796 	size_t left = buffer->size;
797 
798 	bigtime_t timeout = absolute_timeout(socket->send.timeout);
799 	if (gStackModule->is_restarted_syscall())
800 		timeout = gStackModule->restore_syscall_restart_timeout();
801 	else
802 		gStackModule->store_syscall_restart_timeout(timeout);
803 
804 	while (left > 0) {
805 		while (fSendQueue.Free() < socket->send.low_water_mark) {
806 			// wait until enough space is available
807 			status_t status = fSendList.Wait(lock, timeout);
808 			if (status < B_OK) {
809 				TRACE("  SendData() returning %s (%d)",
810 					strerror(posix_error(status)), (int)posix_error(status));
811 				return posix_error(status);
812 			}
813 
814 			if (!is_writable(fState)) {
815 				// we only send signals when called from userland
816 				if (gStackModule->is_syscall())
817 					send_signal(find_thread(NULL), SIGPIPE);
818 				return EPIPE;
819 			}
820 		}
821 
822 		size_t size = fSendQueue.Free();
823 		if (size < left) {
824 			// we need to split the original buffer
825 			net_buffer* clone = gBufferModule->clone(buffer, false);
826 				// TODO: add offset/size parameter to net_buffer::clone() or
827 				// even a move_data() function, as this is a bit inefficient
828 			if (clone == NULL)
829 				return ENOBUFS;
830 
831 			status_t status = gBufferModule->trim(clone, size);
832 			if (status != B_OK) {
833 				gBufferModule->free(clone);
834 				return status;
835 			}
836 
837 			gBufferModule->remove_header(buffer, size);
838 			left -= size;
839 			fSendQueue.Add(clone);
840 		} else {
841 			left -= buffer->size;
842 			fSendQueue.Add(buffer);
843 		}
844 	}
845 
846 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
847 
848 	bool force = false;
849 	if ((flags & MSG_OOB) != 0) {
850 		fSendUrgentOffset = fSendQueue.LastSequence();
851 			// RFC 961 specifies that the urgent offset points to the last
852 			// byte of urgent data. However, this is commonly implemented as
853 			// here, ie. it points to the first byte after the urgent data.
854 		force = true;
855 	}
856 	if ((flags & MSG_EOF) != 0)
857 		_Disconnect(false);
858 
859 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
860 		_SendQueued(force);
861 
862 	return B_OK;
863 }
864 
865 
866 ssize_t
867 TCPEndpoint::SendAvailable()
868 {
869 	MutexLocker locker(fLock);
870 
871 	ssize_t available;
872 
873 	if (is_writable(fState))
874 		available = fSendQueue.Free();
875 	else
876 		available = EPIPE;
877 
878 	TRACE("SendAvailable(): %li", available);
879 	return available;
880 }
881 
882 
883 status_t
884 TCPEndpoint::FillStat(net_stat *stat)
885 {
886 	MutexLocker _(fLock);
887 
888 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
889 	stat->receive_queue_size = fReceiveQueue.Available();
890 	stat->send_queue_size = fSendQueue.Used();
891 
892 	return B_OK;
893 }
894 
895 
896 status_t
897 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
898 {
899 	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
900 
901 	MutexLocker locker(fLock);
902 
903 	*_buffer = NULL;
904 
905 	if (fState == CLOSED)
906 		return ENOTCONN;
907 
908 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
909 	if (gStackModule->is_restarted_syscall())
910 		timeout = gStackModule->restore_syscall_restart_timeout();
911 	else
912 		gStackModule->store_syscall_restart_timeout(timeout);
913 
914 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
915 		if (flags & MSG_DONTWAIT)
916 			return B_WOULD_BLOCK;
917 
918 		status_t status = _WaitForEstablished(locker, timeout);
919 		if (status < B_OK)
920 			return posix_error(status);
921 	}
922 
923 	size_t dataNeeded = socket->receive.low_water_mark;
924 
925 	// When MSG_WAITALL is set then the function should block
926 	// until the full amount of data can be returned.
927 	if (flags & MSG_WAITALL)
928 		dataNeeded = numBytes;
929 
930 	// TODO: add support for urgent data (MSG_OOB)
931 
932 	while (true) {
933 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
934 			|| fState == TIME_WAIT) {
935 			// ``Connection closing''.
936 			return B_OK;
937 		}
938 
939 		if (fReceiveQueue.Available() > 0) {
940 			if (fReceiveQueue.Available() >= dataNeeded
941 				|| (fReceiveQueue.PushedData() > 0
942 					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
943 				break;
944 		} else if (fState == FINISH_RECEIVED) {
945 			// ``If no text is awaiting delivery, the RECEIVE will
946 			//   get a Connection closing''.
947 			return B_OK;
948 		}
949 
950 		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
951 			return B_WOULD_BLOCK;
952 
953 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
954 			return B_OK;
955 
956 		status_t status = fReceiveList.Wait(locker, timeout);
957 		if (status < B_OK) {
958 			// The Open Group base specification mentions that EINTR should be
959 			// returned if the recv() is interrupted before _any data_ is
960 			// available. So we actually check if there is data, and if so,
961 			// push it to the user.
962 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
963 				&& fReceiveQueue.Available() > 0)
964 				break;
965 
966 			return posix_error(status);
967 		}
968 	}
969 
970 	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());
971 
972 	if (numBytes < fReceiveQueue.Available())
973 		fReceiveList.Signal();
974 
975 	bool clone = (flags & MSG_PEEK) != 0;
976 
977 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
978 
979 	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());
980 
981 	// if we are opening the window, check if we should send an ACK
982 	if (!clone)
983 		SendAcknowledge(false);
984 
985 	return receivedBytes;
986 }
987 
988 
989 ssize_t
990 TCPEndpoint::ReadAvailable()
991 {
992 	MutexLocker locker(fLock);
993 
994 	TRACE("ReadAvailable(): %li", _AvailableData());
995 
996 	return _AvailableData();
997 }
998 
999 
1000 status_t
1001 TCPEndpoint::SetSendBufferSize(size_t length)
1002 {
1003 	MutexLocker _(fLock);
1004 	fSendQueue.SetMaxBytes(length);
1005 	return B_OK;
1006 }
1007 
1008 
1009 status_t
1010 TCPEndpoint::SetReceiveBufferSize(size_t length)
1011 {
1012 	MutexLocker _(fLock);
1013 	fReceiveQueue.SetMaxBytes(length);
1014 	return B_OK;
1015 }
1016 
1017 
1018 status_t
1019 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1020 {
1021 	if (*_length != sizeof(int))
1022 		return B_BAD_VALUE;
1023 
1024 	int* value = (int*)_value;
1025 
1026 	switch (option) {
1027 		case TCP_NODELAY:
1028 			if ((fOptions & TCP_NODELAY) != 0)
1029 				*value = 1;
1030 			else
1031 				*value = 0;
1032 			return B_OK;
1033 
1034 		case TCP_MAXSEG:
1035 			*value = fReceiveMaxSegmentSize;
1036 			return B_OK;
1037 
1038 		default:
1039 			return B_BAD_VALUE;
1040 	}
1041 }
1042 
1043 
1044 status_t
1045 TCPEndpoint::SetOption(int option, const void* _value, int length)
1046 {
1047 	if (option != TCP_NODELAY)
1048 		return B_BAD_VALUE;
1049 
1050 	if (length != sizeof(int))
1051 		return B_BAD_VALUE;
1052 
1053 	const int* value = (const int*)_value;
1054 
1055 	MutexLocker _(fLock);
1056 	if (*value)
1057 		fOptions |= TCP_NODELAY;
1058 	else
1059 		fOptions &= ~TCP_NODELAY;
1060 
1061 	return B_OK;
1062 }
1063 
1064 
1065 //	#pragma mark - misc
1066 
1067 
1068 bool
1069 TCPEndpoint::IsBound() const
1070 {
1071 	return !LocalAddress().IsEmpty(true);
1072 }
1073 
1074 
1075 bool
1076 TCPEndpoint::IsLocal() const
1077 {
1078 	return (fFlags & FLAG_LOCAL) != 0;
1079 }
1080 
1081 
1082 status_t
1083 TCPEndpoint::DelayedAcknowledge()
1084 {
1085 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1086 		// timer was active, send an ACK now (with the exception above,
1087 		// we send every other ACK)
1088 		return SendAcknowledge(true);
1089 	}
1090 
1091 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1092 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1093 	return B_OK;
1094 }
1095 
1096 
1097 status_t
1098 TCPEndpoint::SendAcknowledge(bool force)
1099 {
1100 	return _SendQueued(force, 0);
1101 }
1102 
1103 
1104 void
1105 TCPEndpoint::_StartPersistTimer()
1106 {
1107 	gStackModule->set_timer(&fPersistTimer, 1000000LL);
1108 }
1109 
1110 
1111 void
1112 TCPEndpoint::_EnterTimeWait()
1113 {
1114 	TRACE("_EnterTimeWait()\n");
1115 
1116 	_CancelConnectionTimers();
1117 
1118 	if (fState == TIME_WAIT && IsLocal()) {
1119 		// we do not use TIME_WAIT state for local connections
1120 		fFlags |= FLAG_DELETE_ON_CLOSE;
1121 		return;
1122 	}
1123 
1124 	_UpdateTimeWait();
1125 }
1126 
1127 
1128 void
1129 TCPEndpoint::_UpdateTimeWait()
1130 {
1131 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1132 }
1133 
1134 
1135 void
1136 TCPEndpoint::_CancelConnectionTimers()
1137 {
1138 	gStackModule->cancel_timer(&fRetransmitTimer);
1139 	gStackModule->cancel_timer(&fPersistTimer);
1140 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1141 }
1142 
1143 
1144 /*!	Sends the FIN flag to the peer when the connection is still open.
1145 	Moves the endpoint to the next state depending on where it was.
1146 */
1147 status_t
1148 TCPEndpoint::_Disconnect(bool closing)
1149 {
1150 	tcp_state previousState = fState;
1151 
1152 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1153 		fState = FINISH_SENT;
1154 	else if (fState == FINISH_RECEIVED)
1155 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1156 	else
1157 		return B_OK;
1158 
1159 	T(State(this));
1160 
1161 	status_t status = _SendQueued();
1162 	if (status != B_OK) {
1163 		fState = previousState;
1164 		T(State(this));
1165 		return status;
1166 	}
1167 
1168 	return B_OK;
1169 }
1170 
1171 
1172 void
1173 TCPEndpoint::_MarkEstablished()
1174 {
1175 	fState = ESTABLISHED;
1176 	T(State(this));
1177 
1178 	if (gSocketModule->has_parent(socket)) {
1179 		gSocketModule->set_connected(socket);
1180 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1181 	}
1182 
1183 	fSendList.Signal();
1184 }
1185 
1186 
1187 status_t
1188 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1189 {
1190 	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1191 	// When investigating this, also have a look at _Close() and _HandleReset().
1192 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1193 		if (socket->error != B_OK)
1194 			return socket->error;
1195 
1196 		status_t status = fSendList.Wait(locker, timeout);
1197 		if (status < B_OK)
1198 			return status;
1199 	}
1200 
1201 	return B_OK;
1202 }
1203 
1204 
1205 //	#pragma mark - receive
1206 
1207 
1208 void
1209 TCPEndpoint::_Close()
1210 {
1211 	_CancelConnectionTimers();
1212 	fState = CLOSED;
1213 	T(State(this));
1214 
1215 	fFlags |= FLAG_DELETE_ON_CLOSE;
1216 
1217 	fSendList.Signal();
1218 	_NotifyReader();
1219 
1220 	if (gSocketModule->has_parent(socket)) {
1221 		// We still have a parent - obviously, we haven't been accepted yet,
1222 		// so no one could ever close us.
1223 		_CancelConnectionTimers();
1224 		gSocketModule->set_aborted(socket);
1225 	}
1226 }
1227 
1228 
1229 void
1230 TCPEndpoint::_HandleReset(status_t error)
1231 {
1232 	socket->error = error;
1233 	_Close();
1234 
1235 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1236 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1237 }
1238 
1239 
1240 void
1241 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1242 {
1243 	if (++fDuplicateAcknowledgeCount < 3)
1244 		return;
1245 
1246 	if (fDuplicateAcknowledgeCount == 3) {
1247 		_ResetSlowStart();
1248 		fCongestionWindow = fSlowStartThreshold + 3
1249 			* fSendMaxSegmentSize;
1250 		fSendNext = segment.acknowledge;
1251 	} else if (fDuplicateAcknowledgeCount > 3)
1252 		fCongestionWindow += fSendMaxSegmentSize;
1253 
1254 	_SendQueued();
1255 }
1256 
1257 
1258 void
1259 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1260 	size_t segmentLength)
1261 {
1262 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1263 		tcp_sequence sequence(segment.sequence);
1264 
1265 		if (fLastAcknowledgeSent >= sequence
1266 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1267 			fReceivedTimestamp = segment.timestamp_value;
1268 	}
1269 }
1270 
1271 
1272 ssize_t
1273 TCPEndpoint::_AvailableData() const
1274 {
1275 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1276 	//       the application of FLAG_NO_RECEIVE in listen()ing
1277 	//       sockets.
1278 	if (fState == LISTEN)
1279 		return gSocketModule->count_connected(socket);
1280 	if (fState == SYNCHRONIZE_SENT)
1281 		return 0;
1282 
1283 	ssize_t availableData = fReceiveQueue.Available();
1284 
1285 	if (availableData == 0 && !_ShouldReceive())
1286 		return ENOTCONN;
1287 
1288 	return availableData;
1289 }
1290 
1291 
1292 void
1293 TCPEndpoint::_NotifyReader()
1294 {
1295 	fReceiveList.Signal();
1296 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1297 }
1298 
1299 
1300 bool
1301 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1302 {
1303 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1304 		// Remember the position of the finish received flag
1305 		fFinishReceived = true;
1306 		fFinishReceivedAt = segment.sequence + buffer->size;
1307 	}
1308 
1309 	fReceiveQueue.Add(buffer, segment.sequence);
1310 	fReceiveNext = fReceiveQueue.NextSequence();
1311 
1312 	if (fFinishReceived) {
1313 		// Set or reset the finish flag on the current segment
1314 		if (fReceiveNext < fFinishReceivedAt)
1315 			segment.flags &= ~TCP_FLAG_FINISH;
1316 		else
1317 			segment.flags |= TCP_FLAG_FINISH;
1318 	}
1319 
1320 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1321 		fReceiveNext.Number(), fReceiveQueue.Available());
1322 
1323 	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1324 		fReceiveQueue.SetPushPointer();
1325 
1326 	return fReceiveQueue.Available() > 0;
1327 }
1328 
1329 
1330 void
1331 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1332 {
1333 	fInitialReceiveSequence = segment.sequence;
1334 	fFinishReceived = false;
1335 
1336 	// count the received SYN
1337 	segment.sequence++;
1338 
1339 	fReceiveNext = segment.sequence;
1340 	fReceiveQueue.SetInitialSequence(segment.sequence);
1341 
1342 	if ((fOptions & TCP_NOOPT) == 0) {
1343 		if (segment.max_segment_size > 0)
1344 			fSendMaxSegmentSize = segment.max_segment_size;
1345 
1346 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1347 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1348 			fSendWindowShift = segment.window_shift;
1349 		} else {
1350 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1351 			fReceiveWindowShift = 0;
1352 		}
1353 
1354 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1355 			fFlags |= FLAG_OPTION_TIMESTAMP;
1356 			fReceivedTimestamp = segment.timestamp_value;
1357 		} else
1358 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1359 	}
1360 
1361 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1362 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1363 }
1364 
1365 
1366 bool
1367 TCPEndpoint::_ShouldReceive() const
1368 {
1369 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1370 		return false;
1371 
1372 	return fState == ESTABLISHED || fState == FINISH_SENT
1373 		|| fState == FINISH_ACKNOWLEDGED;
1374 }
1375 
1376 
1377 int32
1378 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1379 	net_buffer* buffer)
1380 {
1381 	MutexLocker _(fLock);
1382 
1383 	// TODO error checking
1384 	ProtocolSocket::Open();
1385 
1386 	fState = SYNCHRONIZE_RECEIVED;
1387 	T(Spawn(parent, this));
1388 
1389 	fManager = parent->fManager;
1390 
1391 	LocalAddress().SetTo(buffer->destination);
1392 	PeerAddress().SetTo(buffer->source);
1393 
1394 	TRACE("Spawn()");
1395 
1396 	// TODO: proper error handling!
1397 	if (fManager->BindChild(this) != B_OK) {
1398 		T(Error(this, "binding failed", __LINE__));
1399 		return DROP;
1400 	}
1401 	if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1402 		T(Error(this, "prepare send faild", __LINE__));
1403 		return DROP;
1404 	}
1405 
1406 	fOptions = parent->fOptions;
1407 	fAcceptSemaphore = parent->fAcceptSemaphore;
1408 
1409 	_PrepareReceivePath(segment);
1410 
1411 	// send SYN+ACK
1412 	if (_SendQueued() != B_OK) {
1413 		T(Error(this, "sending failed", __LINE__));
1414 		return DROP;
1415 	}
1416 
1417 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1418 		// we handled this flag now, it must not be set for further processing
1419 
1420 	return _Receive(segment, buffer);
1421 }
1422 
1423 
1424 int32
1425 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1426 {
1427 	TRACE("ListenReceive()");
1428 
1429 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1430 	// but the error behaviour differs
1431 	if (segment.flags & TCP_FLAG_RESET)
1432 		return DROP;
1433 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1434 		return DROP | RESET;
1435 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1436 		return DROP;
1437 
1438 	// TODO: drop broadcast/multicast
1439 
1440 	// spawn new endpoint for accept()
1441 	net_socket* newSocket;
1442 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1443 		T(Error(this, "spawning failed", __LINE__));
1444 		return DROP;
1445 	}
1446 
1447 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1448 		segment, buffer);
1449 }
1450 
1451 
1452 int32
1453 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1454 	net_buffer *buffer)
1455 {
1456 	TRACE("_SynchronizeSentReceive()");
1457 
1458 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1459 		&& (fInitialSendSequence >= segment.acknowledge
1460 			|| fSendMax < segment.acknowledge))
1461 		return DROP | RESET;
1462 
1463 	if (segment.flags & TCP_FLAG_RESET) {
1464 		_HandleReset(ECONNREFUSED);
1465 		return DROP;
1466 	}
1467 
1468 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1469 		return DROP;
1470 
1471 	fSendUnacknowledged = segment.acknowledge;
1472 	_PrepareReceivePath(segment);
1473 
1474 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1475 		_MarkEstablished();
1476 	} else {
1477 		// simultaneous open
1478 		fState = SYNCHRONIZE_RECEIVED;
1479 		T(State(this));
1480 	}
1481 
1482 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1483 		// we handled this flag now, it must not be set for further processing
1484 
1485 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1486 }
1487 
1488 
/*!	Standard processing of an incoming segment; reached from
	SegmentReceived(), _Spawn(), and _SynchronizeSentReceive(). Must not be
	called in the LISTEN or SYNCHRONIZE_SENT state (see the ASSERT below).

	The steps are, in this order: header prediction fast path, sequence
	check against the receive window, RST handling, SYN validation, trimming
	of the payload to the receive window, send window update, ACK processing
	(including the state changes caused by an acknowledged FIN), queueing of
	the payload, and finally FIN processing.

	\param segment The parsed segment header; its sequence and flags are
		adjusted while the segment is trimmed.
	\param buffer The payload of the segment; it may get trimmed on both
		ends.
	\return A combination of the KEEP, DROP, ACKNOWLEDGE,
		IMMEDIATE_ACKNOWLEDGE, and RESET action flags. The two acknowledge
		flags are acted upon by SegmentReceived().
*/
1489 int32
1490 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1491 {
1492 	uint32 advertisedWindow = (uint32)segment.advertised_window
1493 		<< fSendWindowShift;
1494 	size_t segmentLength = buffer->size;
1495 
1496 	// First, handle the most common case for uni-directional data transfer
1497 	// (known as header prediction - the segment must not change the window,
1498 	// and must be the expected sequence, and contain no control flags)
1499 
1500 	if (fState == ESTABLISHED
1501 		&& segment.AcknowledgeOnly()
1502 		&& fReceiveNext == segment.sequence
1503 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1504 		&& fSendNext == fSendMax) {
1505 		_UpdateTimestamps(segment, segmentLength);
1506 
1507 		if (segmentLength == 0) {
1508 			// this is a pure acknowledge segment - we're on the sending end
1509 			if (fSendUnacknowledged < segment.acknowledge
1510 				&& fSendMax >= segment.acknowledge) {
1511 				_Acknowledged(segment);
1512 				return DROP;
1513 			}
     			// otherwise fall through to the standard processing below
1514 		} else if (segment.acknowledge == fSendUnacknowledged
1515 			&& fReceiveQueue.IsContiguous()
1516 			&& fReceiveQueue.Free() >= segmentLength
1517 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
     			// pure in-order data that acknowledges nothing new - we're on
     			// the receiving end
1518 			if (_AddData(segment, buffer))
1519 				_NotifyReader();
1520 
1521 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1522 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1523 		}
1524 	}
1525 
1526 	// The fast path was not applicable, so we continue with the standard
1527 	// processing of the incoming segment
1528 
1529 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1530 
1531 	if (fState != CLOSED && fState != TIME_WAIT) {
1532 		// Check sequence number
1533 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1534 				fReceiveWindow)) {
1535 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1536 				fReceiveNext.Number(), fReceiveWindow);
1537 			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1538 				// TODO: this doesn't look right - review!
1539 				return DROP;
1540 			}
1541 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1542 		}
1543 	}
1544 
1545 	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1546 		// Is this a valid reset?
1547 		// We generally ignore resets in time wait state (see RFC 1337)
1548 		if (fLastAcknowledgeSent <= segment.sequence
1549 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1550 				+ fReceiveWindow)
1551 			&& fState != TIME_WAIT) {
     			// the error that is reported depends on how far the
     			// connection has come
1552 			status_t error;
1553 			if (fState == SYNCHRONIZE_RECEIVED)
1554 				error = ECONNREFUSED;
1555 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1556 				error = ENOTCONN;
1557 			else
1558 				error = ECONNRESET;
1559 
1560 			_HandleReset(error);
1561 		}
1562 
1563 		return DROP;
1564 	}
1565 
1566 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1567 		|| (fState == SYNCHRONIZE_RECEIVED
1568 			&& (fInitialReceiveSequence > segment.sequence
1569 				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1570 					&& (fSendUnacknowledged > segment.acknowledge
1571 						|| fSendMax < segment.acknowledge))))) {
1572 		// reset the connection - either the initial SYN was faulty, or we
1573 		// received a SYN within the data stream
1574 		return DROP | RESET;
1575 	}
1576 
1577 	// TODO: Check this! Why do we advertise a window outside of what we should
1578 	// buffer?
1579 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1580 		// the window must not shrink
1581 
1582 	// trim buffer to be within the receive window
1583 	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1584 	if (drop > 0) {
1585 		if ((uint32)drop > buffer->size
1586 			|| ((uint32)drop == buffer->size
1587 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1588 			// don't accidentally remove a FIN we shouldn't remove
1589 			segment.flags &= ~TCP_FLAG_FINISH;
1590 			drop = buffer->size;
1591 		}
1592 
1593 		// remove duplicate data at the start
1594 		TRACE("* remove %ld bytes from the start", drop);
1595 		gBufferModule->remove_header(buffer, drop);
1596 		segment.sequence += drop;
1597 	}
1598 
1599 	int32 action = KEEP;
1600 
     	// number of bytes by which the segment extends beyond the right edge
     	// of the receive window
1601 	drop = (int32)(segment.sequence + buffer->size
1602 		- (fReceiveNext + fReceiveWindow)).Number();
1603 	if (drop > 0) {
1604 		// remove data exceeding our window
1605 		if ((uint32)drop >= buffer->size) {
1606 			// if we can accept data, or the segment is not what we'd expect,
1607 			// drop the segment (an immediate acknowledge is always triggered)
1608 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1609 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1610 
1611 			action |= IMMEDIATE_ACKNOWLEDGE;
1612 		}
1613 
1614 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1615 			// we need to remove the finish, too, as part of the data
1616 			drop--;
1617 		}
1618 
1619 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1620 		TRACE("* remove %ld bytes from the end", drop);
1621 		gBufferModule->remove_trailer(buffer, drop);
1622 	}
1623 
1624 #ifdef TRACE_TCP
1625 	if (advertisedWindow > fSendWindow) {
1626 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1627 			advertisedWindow);
1628 	}
1629 #endif
1630 
     	// NOTE(review): fSendWindow is overwritten here, before the duplicate
     	// acknowledgement check below compares advertisedWindow against it -
     	// that comparison is therefore always true at that point. Confirm
     	// whether the previous window was meant to be compared.
1631 	fSendWindow = advertisedWindow;
1632 	if (advertisedWindow > fSendMaxWindow)
1633 		fSendMaxWindow = advertisedWindow;
1634 
1635 	// Then look at the acknowledgement for any updates
1636 
1637 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1638 		// process acknowledged data
1639 		if (fState == SYNCHRONIZE_RECEIVED)
1640 			_MarkEstablished();
1641 
     		// acknowledges something we have not sent yet
1642 		if (fSendMax < segment.acknowledge)
1643 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1644 
     		// NOTE(review): only acknowledgements below fSendUnacknowledged
     		// end up in the duplicate branch; one that equals
     		// fSendUnacknowledged takes the else branch, which resets
     		// fDuplicateAcknowledgeCount. Confirm that this is intended.
1645 		if (segment.acknowledge < fSendUnacknowledged) {
1646 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1647 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1648 				TRACE("Receive(): duplicate ack!");
1649 
1650 				_DuplicateAcknowledge(segment);
1651 			}
1652 
1653 			return DROP;
1654 		} else {
1655 			// this segment acknowledges in flight data
1656 
1657 			if (fDuplicateAcknowledgeCount >= 3) {
1658 				// deflate the window.
1659 				fCongestionWindow = fSlowStartThreshold;
1660 			}
1661 
1662 			fDuplicateAcknowledgeCount = 0;
1663 
1664 			if (fSendMax == segment.acknowledge)
1665 				TRACE("Receive(): all inflight data ack'd!");
1666 
     			// An acknowledgement beyond the last sequence of the send
     			// queue in a state past ESTABLISHED is taken as the
     			// acknowledgement of our FIN
1667 			if (segment.acknowledge > fSendQueue.LastSequence()
1668 					&& fState > ESTABLISHED) {
1669 				TRACE("Receive(): FIN has been acknowledged!");
1670 
1671 				switch (fState) {
1672 					case FINISH_SENT:
1673 						fState = FINISH_ACKNOWLEDGED;
1674 						T(State(this));
1675 						break;
1676 					case CLOSING:
1677 						fState = TIME_WAIT;
1678 						T(State(this));
1679 						_EnterTimeWait();
1680 						return DROP;
1681 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1682 						_Close();
1683 						break;
1684 
1685 					default:
1686 						break;
1687 				}
1688 			}
1689 
1690 			if (fState != CLOSED)
1691 				_Acknowledged(segment);
1692 		}
1693 	}
1694 
1695 	if (segment.flags & TCP_FLAG_URGENT) {
1696 		if (fState == ESTABLISHED || fState == FINISH_SENT
1697 			|| fState == FINISH_ACKNOWLEDGED) {
1698 			// TODO: Handle urgent data:
1699 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1700 			//  - signal the user that urgent data is available (SIGURG)
1701 		}
1702 	}
1703 
1704 	bool notify = false;
1705 
1706 	// The buffer may be freed if its data is added to the queue, so cache
1707 	// the size as we still need it later.
1708 	uint32 bufferSize = buffer->size;
1709 
1710 	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1711 		&& _ShouldReceive())
1712 		notify = _AddData(segment, buffer);
1713 	else {
     		// The data is not queued. With FLAG_NO_RECEIVE set, it still
     		// advances fReceiveNext, though.
1714 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1715 			fReceiveNext += buffer->size;
1716 
1717 		action = (action & ~KEEP) | DROP;
1718 	}
1719 
1720 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
     		// the FIN occupies one sequence number
1721 		segmentLength++;
1722 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1723 			TRACE("Receive(): peer is finishing connection!");
1724 			fReceiveNext++;
1725 			notify = true;
1726 
1727 			// FIN implies PUSH
1728 			fReceiveQueue.SetPushPointer();
1729 
1730 			// we'll reply immediately to the FIN if we are not
1731 			// transitioning to TIME WAIT so we immediately ACK it.
1732 			action |= IMMEDIATE_ACKNOWLEDGE;
1733 
1734 			// other side is closing connection; change states
1735 			switch (fState) {
1736 				case ESTABLISHED:
1737 				case SYNCHRONIZE_RECEIVED:
1738 					fState = FINISH_RECEIVED;
1739 					T(State(this));
1740 					break;
1741 				case FINISH_SENT:
1742 					// simultaneous close
1743 					fState = CLOSING;
1744 					T(State(this));
1745 					break;
1746 				case FINISH_ACKNOWLEDGED:
1747 					fState = TIME_WAIT;
1748 					T(State(this));
1749 					_EnterTimeWait();
1750 					break;
1751 				case TIME_WAIT:
1752 					_UpdateTimeWait();
1753 					break;
1754 
1755 				default:
1756 					break;
1757 			}
1758 		}
1759 	}
1760 
1761 	if (notify)
1762 		_NotifyReader();
1763 
     	// NOTE(review): any segment that still has its SYN flag set has
     	// already been answered with DROP | RESET above, so the SYN part of
     	// this condition does not look reachable - confirm.
1764 	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1765 		action |= ACKNOWLEDGE;
1766 
1767 	_UpdateTimestamps(segment, segmentLength);
1768 
1769 	TRACE("Receive() Action %ld", action);
1770 
1771 	return action;
1772 }
1773 
1774 
1775 int32
1776 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1777 {
1778 	MutexLocker locker(fLock);
1779 
1780 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
1781 		"\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
1782 		buffer, buffer->size, PrintAddress(buffer->source),
1783 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1784 		segment.acknowledge,
1785 		(uint32)segment.advertised_window << fSendWindowShift);
1786 	T(Receive(this, segment,
1787 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1788 	int32 segmentAction = DROP;
1789 
1790 	switch (fState) {
1791 		case LISTEN:
1792 			segmentAction = _ListenReceive(segment, buffer);
1793 			break;
1794 
1795 		case SYNCHRONIZE_SENT:
1796 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1797 			break;
1798 
1799 		case SYNCHRONIZE_RECEIVED:
1800 		case ESTABLISHED:
1801 		case FINISH_RECEIVED:
1802 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1803 		case FINISH_SENT:
1804 		case FINISH_ACKNOWLEDGED:
1805 		case CLOSING:
1806 		case TIME_WAIT:
1807 		case CLOSED:
1808 			segmentAction = _Receive(segment, buffer);
1809 			break;
1810 	}
1811 
1812 	// process acknowledge action as asked for by the *Receive() method
1813 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1814 		SendAcknowledge(true);
1815 	else if (segmentAction & ACKNOWLEDGE)
1816 		DelayedAcknowledge();
1817 
1818 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1819 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1820 		locker.Unlock();
1821 		gSocketModule->release_socket(socket);
1822 	}
1823 
1824 	return segmentAction;
1825 }
1826 
1827 
1828 //	#pragma mark - send
1829 
1830 
1831 inline uint8
1832 TCPEndpoint::_CurrentFlags()
1833 {
1834 	// we don't set FLAG_FINISH here, instead we do it
1835 	// conditionally below depending if we are sending
1836 	// the last bytes of the send queue.
1837 
1838 	switch (fState) {
1839 		case CLOSED:
1840 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1841 
1842 		case SYNCHRONIZE_SENT:
1843 			return TCP_FLAG_SYNCHRONIZE;
1844 		case SYNCHRONIZE_RECEIVED:
1845 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1846 
1847 		case ESTABLISHED:
1848 		case FINISH_RECEIVED:
1849 		case FINISH_ACKNOWLEDGED:
1850 		case TIME_WAIT:
1851 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1852 		case FINISH_SENT:
1853 		case CLOSING:
1854 			return TCP_FLAG_ACKNOWLEDGE;
1855 
1856 		default:
1857 			return 0;
1858 	}
1859 }
1860 
1861 
1862 inline bool
1863 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1864 	uint32 segmentMaxSize, uint32 flightSize)
1865 {
1866 	if (length > 0) {
1867 		// Avoid the silly window syndrome - we only send a segment in case:
1868 		// - we have a full segment to send, or
1869 		// - we're at the end of our buffer queue, or
1870 		// - the buffer is at least larger than half of the maximum send window,
1871 		//   or
1872 		// - we're retransmitting data
1873 		if (length == segmentMaxSize
1874 			|| (fOptions & TCP_NODELAY) != 0
1875 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1876 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1877 			return true;
1878 	}
1879 
1880 	// check if we need to send a window update to the peer
1881 	if (segment.advertised_window > 0) {
1882 		// correct the window to take into account what already has been advertised
1883 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1884 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
1885 
1886 		// if we can advertise a window larger than twice the maximum segment
1887 		// size, or half the maximum buffer size we send a window update
1888 		if (window >= (fReceiveMaxSegmentSize << 1)
1889 			|| window >= (socket->receive.buffer_size >> 1))
1890 			return true;
1891 	}
1892 
1893 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1894 			| TCP_FLAG_RESET)) != 0)
1895 		return true;
1896 
1897 	// We do have urgent data pending
1898 	if (fSendUrgentOffset > fSendNext)
1899 		return true;
1900 
1901 	// there is no reason to send a segment just now
1902 	return false;
1903 }
1904 
1905 
1906 status_t
1907 TCPEndpoint::_SendQueued(bool force)
1908 {
1909 	return _SendQueued(force, fSendWindow);
1910 }
1911 
1912 
1913 /*!	Sends one or more TCP segments with the data waiting in the queue, or some
1914 	specific flags that need to be sent.
	\param force If \c true, a segment is sent even if
		_ShouldSendSegment() would not send one; with a closed window and
		data left in the queue, a single byte is sent to ask for a window
		update (see the persist timer).
	\param sendWindow The send window to use; it is further limited by the
		congestion window, and by what is already in flight.
	\return B_OK on success (also if nothing needed to be sent), B_ERROR
		without a route or in LISTEN state, B_NO_MEMORY if no buffer could be
		created, or the error of the send queue, the header construction, or
		the next protocol's send_routed_data().
1915 */
1916 status_t
1917 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1918 {
1919 	if (fRoute == NULL)
1920 		return B_ERROR;
1921 
1922 	// in passive state?
1923 	if (fState == LISTEN)
1924 		return B_ERROR;
1925 
1926 	tcp_segment_header segment(_CurrentFlags());
1927 
1928 	if ((fOptions & TCP_NOOPT) == 0) {
1929 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1930 			segment.options |= TCP_HAS_TIMESTAMPS;
1931 			segment.timestamp_reply = fReceivedTimestamp;
1932 			segment.timestamp_value = tcp_now();
1933 		}
1934 
1935 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1936 			&& fSendNext == fInitialSendSequence) {
1937 			// add connection establishment options
1938 			segment.max_segment_size = fReceiveMaxSegmentSize;
1939 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1940 				segment.options |= TCP_HAS_WINDOW_SCALE;
1941 				segment.window_shift = fReceiveWindowShift;
1942 			}
1943 		}
1944 	}
1945 
     	// Advertise the free space of the receive queue - scaled down if
     	// window scaling is in use, limited to TCP_MAX_WINDOW otherwise
1946 	size_t availableBytes = fReceiveQueue.Free();
1947 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1948 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1949 	else
1950 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1951 
1952 	segment.acknowledge = fReceiveNext.Number();
1953 
1954 	// Process urgent data
1955 	if (fSendUrgentOffset > fSendNext) {
1956 		segment.flags |= TCP_FLAG_URGENT;
1957 		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
1958 	} else {
1959 		fSendUrgentOffset = fSendUnacknowledged.Number();
1960 			// Keep urgent offset updated, so that it doesn't reach into our
1961 			// send window on overlap
1962 		segment.urgent_offset = 0;
1963 	}
1964 
     	// a non-zero congestion window further limits the send window
1965 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1966 		sendWindow = fCongestionWindow;
1967 
1968 	// fSendUnacknowledged
1969 	//  |    fSendNext      fSendMax
1970 	//  |        |              |
1971 	//  v        v              v
1972 	//  -----------------------------------
1973 	//  | effective window           |
1974 	//  -----------------------------------
1975 
1976 	// Flight size represents the window of data which is currently in the
1977 	// ether. We should never send data such that the flight size becomes larger
1978 	// than the effective window. Note however that the effective window may be
1979 	// reduced (by congestion for instance), so at some point in time flight
1980 	// size may be larger than the currently calculated window.
1981 
     	// Note: flightSize is only handed to _ShouldSendSegment() below.
1982 	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
1983 	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
1984 
1985 	if (consumedWindow > sendWindow) {
1986 		sendWindow = 0;
1987 		// TODO: enter persist state? try to get a window update.
1988 	} else
1989 		sendWindow -= consumedWindow;
1990 
1991 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1992 		// send one byte of data to ask for a window update
1993 		// (triggered by the persist timer)
1994 		sendWindow = 1;
1995 	}
1996 
     	// "length" is the number of bytes that are still to be sent by the
     	// loop below; previousSendNext is needed for the retransmit timer
     	// decision after the loop
1997 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1998 	tcp_sequence previousSendNext = fSendNext;
1999 
     	// The loop runs at least once, so that a segment without any payload
     	// (flags or acknowledgement only) can be sent as well.
2000 	do {
2001 		uint32 segmentMaxSize = fSendMaxSegmentSize
2002 			- tcp_options_length(segment);
2003 		uint32 segmentLength = min_c(length, segmentMaxSize);
2004 
     		// The segment that reaches the end of the send queue carries the
     		// FIN (if the state asks for one), and the PUSH flag if it has
     		// payload
2005 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
2006 			if (state_needs_finish(fState))
2007 				segment.flags |= TCP_FLAG_FINISH;
2008 			if (length > 0)
2009 				segment.flags |= TCP_FLAG_PUSH;
2010 		}
2011 
2012 		// Determine if we should really send this segment
2013 		if (!force && !_ShouldSendSegment(segment, segmentLength,
2014 				segmentMaxSize, flightSize)) {
     			// There is data left that we are not sending now; make sure
     			// that at least one timer gets us back here
2015 			if (fSendQueue.Available()
2016 				&& !gStackModule->is_timer_active(&fPersistTimer)
2017 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
2018 				_StartPersistTimer();
2019 			break;
2020 		}
2021 
     		// NOTE(review): the error returns within this loop leave the
     		// function before the retransmit timer is set below, even if a
     		// previous iteration has already sent a segment - confirm that
     		// this is acceptable.
2022 		net_buffer *buffer = gBufferModule->create(256);
2023 		if (buffer == NULL)
2024 			return B_NO_MEMORY;
2025 
2026 		status_t status = B_OK;
2027 		if (segmentLength > 0)
2028 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2029 		if (status < B_OK) {
2030 			gBufferModule->free(buffer);
2031 			return status;
2032 		}
2033 
2034 		LocalAddress().CopyTo(buffer->source);
2035 		PeerAddress().CopyTo(buffer->destination);
2036 
     		// remember the payload size before the TCP header is added
2037 		uint32 size = buffer->size;
2038 		segment.sequence = fSendNext.Number();
2039 
2040 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
2041 			"\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n"
2042 			"\tlen %lu first %lu last %lu",
2043 			buffer, buffer->size, PrintAddress(buffer->source),
2044 			PrintAddress(buffer->destination), segment.flags, segment.sequence,
2045 			segment.acknowledge, segment.advertised_window,
2046 			fCongestionWindow, fSlowStartThreshold, segmentLength,
2047 			fSendQueue.FirstSequence().Number(),
2048 			fSendQueue.LastSequence().Number());
2049 		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2050 			fSendQueue.LastSequence()));
2051 
2052 		PROBE(buffer, sendWindow);
2053 		sendWindow -= buffer->size;
2054 
2055 		status = add_tcp_header(AddressModule(), segment, buffer);
2056 		if (status != B_OK) {
2057 			gBufferModule->free(buffer);
2058 			return status;
2059 		}
2060 
2061 		// Update send status - we need to do this before we send the data
2062 		// for local connections as the answer is directly handled
2063 
2064 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
     			// The header has been built already - the connection
     			// establishment options must not be part of the following
     			// segments. The SYN occupies one sequence number.
2065 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
2066 			segment.max_segment_size = 0;
2067 			size++;
2068 		}
2069 
     		// the FIN occupies one sequence number, too
2070 		if (segment.flags & TCP_FLAG_FINISH)
2071 			size++;
2072 
     		// keep the old fSendMax around in case sending fails
2073 		uint32 sendMax = fSendMax.Number();
2074 		fSendNext += size;
2075 		if (fSendMax < fSendNext)
2076 			fSendMax = fSendNext;
2077 
2078 		fReceiveMaxAdvertised = fReceiveNext
2079 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
2080 
2081 		status = next->module->send_routed_data(next, fRoute, buffer);
2082 		if (status < B_OK) {
2083 			gBufferModule->free(buffer);
2084 
2085 			fSendNext = segment.sequence;
2086 			fSendMax = sendMax;
2087 				// restore send status
2088 			return status;
2089 		}
2090 
2091 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
2092 			fLastAcknowledgeSent = segment.acknowledge;
2093 
     		// the control flags are only sent once
2094 		length -= segmentLength;
2095 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2096 			| TCP_FLAG_FINISH);
2097 	} while (length > 0);
2098 
2099 	// if we sent data from the beginning of the send queue,
2100 	// start the retransmission timer
2101 	if (previousSendNext == fSendUnacknowledged
2102 		&& fSendNext > previousSendNext) {
2103 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
2104 			fRetransmitTimeout);
2105 
2106 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2107 	}
2108 
2109 	return B_OK;
2110 }
2111 
2112 
2113 int
2114 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2115 {
2116 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2117 }
2118 
2119 
2120 status_t
2121 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2122 {
2123 	if (fRoute == NULL) {
2124 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2125 		if (fRoute == NULL)
2126 			return ENETUNREACH;
2127 
2128 		if ((fRoute->flags & RTF_LOCAL) != 0)
2129 			fFlags |= FLAG_LOCAL;
2130 	}
2131 
2132 	// make sure connection does not already exist
2133 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2134 		fRoute->interface_address->local);
2135 	if (status < B_OK)
2136 		return status;
2137 
2138 	fInitialSendSequence = system_time() >> 4;
2139 	fSendNext = fInitialSendSequence;
2140 	fSendUnacknowledged = fInitialSendSequence;
2141 	fSendMax = fInitialSendSequence;
2142 	fSendUrgentOffset = fInitialSendSequence;
2143 
2144 	// we are counting the SYN here
2145 	fSendQueue.SetInitialSequence(fSendNext + 1);
2146 
2147 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2148 
2149 	// Compute the window shift we advertise to our peer - if it doesn't support
2150 	// this option, this will be reset to 0 (when its SYN is received)
2151 	fReceiveWindowShift = 0;
2152 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2153 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2154 		fReceiveWindowShift++;
2155 	}
2156 
2157 	return B_OK;
2158 }
2159 
2160 
2161 void
2162 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2163 {
2164 	size_t previouslyUsed = fSendQueue.Used();
2165 
2166 	fSendQueue.RemoveUntil(segment.acknowledge);
2167 	fSendUnacknowledged = segment.acknowledge;
2168 
2169 	if (fSendNext < fSendUnacknowledged)
2170 		fSendNext = fSendUnacknowledged;
2171 
2172 	if (fSendUnacknowledged == fSendMax)
2173 		gStackModule->cancel_timer(&fRetransmitTimer);
2174 
2175 	if (fSendQueue.Used() < previouslyUsed) {
2176 		// this ACK acknowledged data
2177 
2178 		if (segment.options & TCP_HAS_TIMESTAMPS)
2179 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2180 		else {
2181 			// TODO: Fallback to RFC 793 type estimation
2182 		}
2183 
2184 		if (is_writable(fState)) {
2185 			// notify threads waiting on the socket to become writable again
2186 			fSendList.Signal();
2187 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
2188 		}
2189 
2190 		if (fCongestionWindow < fSlowStartThreshold)
2191 			fCongestionWindow += fSendMaxSegmentSize;
2192 	}
2193 
2194 	if (fCongestionWindow >= fSlowStartThreshold) {
2195 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2196 
2197 		if (increment < fCongestionWindow)
2198 			increment = 1;
2199 		else
2200 			increment /= fCongestionWindow;
2201 
2202 		fCongestionWindow += increment;
2203 	}
2204 
2205 	// if there is data left to be send, send it now
2206 	if (fSendQueue.Used() > 0)
2207 		_SendQueued();
2208 }
2209 
2210 
2211 void
2212 TCPEndpoint::_Retransmit()
2213 {
2214 	TRACE("Retransmit()");
2215 	_ResetSlowStart();
2216 	fSendNext = fSendUnacknowledged;
2217 	_SendQueued();
2218 }
2219 
2220 
2221 void
2222 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2223 {
2224 	int32 rtt = roundTripTime;
2225 
2226 	// "smooth" round trip time as per Van Jacobson
2227 	rtt -= fRoundTripTime / 8;
2228 	fRoundTripTime += rtt;
2229 	if (rtt < 0)
2230 		rtt = -rtt;
2231 	rtt -= fRoundTripDeviation / 4;
2232 	fRoundTripDeviation += rtt;
2233 
2234 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2235 		* kTimestampFactor;
2236 
2237 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
2238 		roundTripTime);
2239 }
2240 
2241 
2242 void
2243 TCPEndpoint::_ResetSlowStart()
2244 {
2245 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2246 		2 * fSendMaxSegmentSize);
2247 	fCongestionWindow = fSendMaxSegmentSize;
2248 }
2249 
2250 
2251 //	#pragma mark - timer
2252 
2253 
2254 /*static*/ void
2255 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2256 {
2257 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2258 	T(Timer(endpoint, "retransmit"));
2259 
2260 	MutexLocker locker(endpoint->fLock);
2261 	if (!locker.IsLocked())
2262 		return;
2263 
2264 	endpoint->_Retransmit();
2265 }
2266 
2267 
2268 /*static*/ void
2269 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2270 {
2271 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2272 	T(Timer(endpoint, "persist"));
2273 
2274 	MutexLocker locker(endpoint->fLock);
2275 	if (!locker.IsLocked())
2276 		return;
2277 
2278 	// the timer might not have been canceled early enough
2279 	if (endpoint->State() == CLOSED)
2280 		return;
2281 
2282 	endpoint->_SendQueued(true);
2283 }
2284 
2285 
2286 /*static*/ void
2287 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2288 {
2289 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2290 	T(Timer(endpoint, "delayed ack"));
2291 
2292 	MutexLocker locker(endpoint->fLock);
2293 	if (!locker.IsLocked())
2294 		return;
2295 
2296 	// the timer might not have been canceled early enough
2297 	if (endpoint->State() == CLOSED)
2298 		return;
2299 
2300 	endpoint->SendAcknowledge(true);
2301 }
2302 
2303 
2304 /*static*/ void
2305 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2306 {
2307 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2308 	T(Timer(endpoint, "time-wait"));
2309 
2310 	MutexLocker locker(endpoint->fLock);
2311 	if (!locker.IsLocked())
2312 		return;
2313 
2314 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2315 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2316 		return;
2317 	}
2318 
2319 	locker.Unlock();
2320 
2321 	gSocketModule->release_socket(endpoint->socket);
2322 }
2323 
2324 
2325 //	#pragma mark -
2326 
2327 
2328 void
2329 TCPEndpoint::Dump() const
2330 {
2331 	kprintf("TCP endpoint %p\n", this);
2332 	kprintf("  state: %s\n", name_for_state(fState));
2333 	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2334 #if KDEBUG
2335 	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2336 #endif
2337 	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2338 	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2339 	kprintf("  send\n");
2340 	kprintf("    window shift: %u\n", fSendWindowShift);
2341 	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2342 		fSendUnacknowledged.Number());
2343 	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2344 	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2345 	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2346 	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2347 	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2348 	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2349 	kprintf("    queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
2350 #if DEBUG_BUFFER_QUEUE
2351 	fSendQueue.Dump();
2352 #endif
2353 	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2354 		fLastAcknowledgeSent.Number());
2355 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2356 		fInitialSendSequence.Number());
2357 	kprintf("  receive\n");
2358 	kprintf("    window shift: %u\n", fReceiveWindowShift);
2359 	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2360 	kprintf("    max advertised: %" B_PRIu32 "\n",
2361 		fReceiveMaxAdvertised.Number());
2362 	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2363 	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2364 	kprintf("    queue: %lu / %lu\n", fReceiveQueue.Available(),
2365 		fReceiveQueue.Size());
2366 #if DEBUG_BUFFER_QUEUE
2367 	fReceiveQueue.Dump();
2368 #endif
2369 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2370 		fInitialReceiveSequence.Number());
2371 	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2372 		fDuplicateAcknowledgeCount);
2373 	kprintf("  round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2374 		fRoundTripTime, fRoundTripDeviation);
2375 	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2376 	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2377 	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2378 }
2379 
2380